Add vnet files.
40fcefb3yMSrZvApO9ToIi-iQwnchA tools/sv/images/xen.png
41013a83z27rKvWIxAfUBMVZ1eDCDg tools/sv/inc/script.js
40fcefb3zGC9XNBkSwTEobCoq8YClA tools/sv/inc/style.css
+41a21888_WlknVWjSxb32Fo13_ujsw tools/vnet/00README
+41a21888bOiOJc7blzRbe4MNJoaYTw tools/vnet/Makefile
+41a21888mg2k5HeiVjlQYEtJBZT4Qg tools/vnet/doc/vnet-module.txt
+41a21888cuxfT8wjCdRR6V1lqf5NtA tools/vnet/doc/vnet-xend.txt
+41a21888xEQJAIGktS6XQ4xz2TyA5g tools/vnet/examples/Makefile
+41a21888FGQhPR5LJ1GRtOSIIN3QEw tools/vnet/examples/network-vnet
+41a21888QPgKrulCfR9SY_pxZKU0KA tools/vnet/examples/vnet97.sxp
+41a21888Gm0UBs1i7HqveT7Yz0u8DQ tools/vnet/examples/vnet98.sxp
+41a21888r4oGPuGv2Lxl-thgV3H54w tools/vnet/examples/vnet99.sxp
+41a21888c9TCRlUwJS9WBjB3e9aWgg tools/vnet/vnet-module/00README
+41a21888K2ItolEkksc1MUqyTDI_Kg tools/vnet/vnet-module/Makefile
+41a21888mJsFJD7bVMm-nrnWnalGBw tools/vnet/vnet-module/Makefile-2.4
+41a21888Znze3-UCCBZ-Nxpj-bNeHA tools/vnet/vnet-module/Makefile-2.6
+41a21889fwc1judJ7DYvyEviSJ3TPg tools/vnet/vnet-module/Makefile.ver
+41a21889m_sYkdODF3j5uhMP-Guy9Q tools/vnet/vnet-module/Makefile.vnet
+41a21889bXW2lC28U6KS_s5tOJ_W9Q tools/vnet/vnet-module/esp.c
+41a21889L2MfLDsUFQxstt-0frIVmw tools/vnet/vnet-module/esp.h
+41a21889V1jOsB2JExI-XQl720WHwg tools/vnet/vnet-module/etherip.c
+41a21889IpMYbNufHMDXe2ndNw4JxA tools/vnet/vnet-module/etherip.h
+41a21889LT9TNqO2EvTFIUTujrkX9w tools/vnet/vnet-module/if_etherip.h
+41a21889PESythGZFG6kmSoOkkN2Nw tools/vnet/vnet-module/if_varp.h
+41a21889nCPEomHqOyQ4vnhEm4II4g tools/vnet/vnet-module/linux/pfkeyv2.h
+41a21889A_fw4pRmCbBfZdtRunM5Eg tools/vnet/vnet-module/random.c
+41a218899Xy2dPKSu3pkuqaqkfKMTA tools/vnet/vnet-module/random.h
+41a21889rIH5S1dv8ygdSsTGNlg0JA tools/vnet/vnet-module/sa.c
+41a218896Z4vxy6gnV9h0fWRWu0lKQ tools/vnet/vnet-module/sa.h
+41a21889qFD8BTbDpB55uVmSVDEsgw tools/vnet/vnet-module/sa_algorithm.c
+41a21889r2AwTe-OCSSVMxBzz8uDtw tools/vnet/vnet-module/sa_algorithm.h
+41a21889tvjtL7O8tMveVB8MdSKPnQ tools/vnet/vnet-module/skb_context.c
+41a21889lD_QOUz2Msd7fB5rJQzfxA tools/vnet/vnet-module/skb_context.h
+41a21889F1r1xnJamzdeuClR8MNwQg tools/vnet/vnet-module/skb_util.c
+41a21889sS4bjVqEna24sS8NpV7SRA tools/vnet/vnet-module/skb_util.h
+41a21889MDawEK3J_f_oAGnZznhG2w tools/vnet/vnet-module/tunnel.c
+41a218896TlHXpVVqF50uz_u_WMXRw tools/vnet/vnet-module/tunnel.h
+41a21889nQYbJbqrOApg_RbkwPtXGg tools/vnet/vnet-module/varp.c
+41a21889Pev5MJlqqass6CxN4mmvPw tools/vnet/vnet-module/varp.h
+41a21889GbsHHfkpA-PkOvltfEwpMA tools/vnet/vnet-module/varp_socket.c
+41a21889sknn8zd5xCJlpQbs7MvxKg tools/vnet/vnet-module/vif.c
+41a21889VsKKWpe6rcXOSLPy2FuNWQ tools/vnet/vnet-module/vif.h
+41a21889dgkOyuSTVqy7D8TPIzrUyw tools/vnet/vnet-module/vnet.c
+41a21889ocAdwk7V1nNt4iBpmYW-Mw tools/vnet/vnet-module/vnet.h
+41a21889YrTiC0ArJSGFtiaHz2j1qQ tools/vnet/vnet-module/vnet_dev.c
+41a21889rHT4vrC4VAfk7-xP_K5aBg tools/vnet/vnet-module/vnet_dev.h
+41a21889qJj6GjT2f5hMHRvPS1AW4w tools/vnet/vnet-module/vnet_ioctl.c
+41a2188a8W4xYB0LYm512agtoEv52g tools/vnet/vnet-module/vnet_ioctl.h
+41a2188aFF_1T9OgpqUjjjaCqKB8lw tools/vnet/vnetd/Makefile
+41a2188a9j84qS4CxqMLVCvyGpA93w tools/vnet/vnetd/connection.c
+41a2188atexNEami9TNVYNkRSb7Bqg tools/vnet/vnetd/connection.h
+41a2188abgYpITSrWoMGHHrM56nklw tools/vnet/vnetd/marshal.c
+41a2188aUbOi5tAYwOS4aPixo1EGwQ tools/vnet/vnetd/marshal.h
+41a2188aDJlSVB1s_st2MSWxW8kMwg tools/vnet/vnetd/select.c
+41a2188aE9LUDdSSwNT3BWVWCvGSnQ tools/vnet/vnetd/select.h
+41a2188aTbMKv_Eig12dSrBUEBl1Jg tools/vnet/vnetd/timer.c
+41a2188aIzBGqQ6DUVzCxfBsN0Q6Ww tools/vnet/vnetd/timer.h
+41a2188aIf3Xk6uvk7KzjdpOsflAEw tools/vnet/vnetd/vcache.c
+41a2188ar6_vOO3_tEJQjmFVU3409A tools/vnet/vnetd/vcache.h
+41a2188aETrGU60X9WtGhYVfU7z0Pw tools/vnet/vnetd/vnetd.c
+41a2188ahYjemudGyB7078AWMFR-0w tools/vnet/vnetd/vnetd.h
4194e861IgTabTt8HOuh143QIJFD1Q tools/x2d2/Makefile
4194e861M2gcBz4i94cQYpqzi8n6UA tools/x2d2/cntrl_con.c
4194e8612TrrMvC8ZlA4h2ZYCPWz4g tools/x2d2/minixend.c
tools/libxc/xen/*
tools/misc/miniterm/miniterm
tools/misc/xen_cpuperf
+tools/vnet/gc
+tools/vnet/gc*/*
+tools/vnet/vnet-module/.tmp_versions/*
+tools/vnet/vnet-module/.*.cmd
+tools/vnet/vnet-module/*.ko
+tools/vnet/vnet-module/vnet_module.mod.*
+tools/vnet/vnetd/vnetd
tools/web-shutdown.tap
tools/xentrace/xentrace
tools/xfrd/xfrd
--- /dev/null
+This directory contains the implementation of vnets:
+virtual private networks for virtual machines.
+See doc/ for more information and examples/ for example
+configurations.
+
+The kernel module is in vnet-module/ and the vnet forwarding
+daemon is in vnetd/. The vnetd daemon makes vnets work across
+subnets when multicast routing is not available.
+
+Mike Wray <mike.wray@hp.com>
\ No newline at end of file
--- /dev/null
+
+export LINUX_RELEASE ?=2.6
+
+all: compile
+
+compile: vnetd vnet-module
+
+gc.tar.gz:
+ wget http://www.hpl.hp.com/personal/Hans_Boehm/gc/gc_source/$@
+
+gc: gc.tar.gz
+ tar xfz gc.tar.gz
+ ln -sf gc?.? gc
+
+gc-install: gc
+ (cd gc && ./configure --prefix=`pwd`/install && make && make install)
+
+gc-clean:
+ -$(MAKE) -C gc clean
+
+gc-pristine:
+ -rm -rf gc?.? gc
+
+# Command targets that never correspond to a file of the same name.
+# ('gc' and 'gc.tar.gz' are real files/directories and stay non-phony.)
+.PHONY: all compile vnetd vnet-module install dist clean
+.PHONY: gc-install gc-clean gc-pristine
+
+vnetd: gc-install
+ $(MAKE) -C vnetd
+
+vnet-module:
+ $(MAKE) -C vnet-module
+
+install: compile
+ $(MAKE) -C vnetd install
+ $(MAKE) -C vnet-module install
+ $(MAKE) -C examples install
+
+dist: $(TARGET)
+ $(MAKE) prefix=`pwd`/../../install dist=yes install
+
+clean:
+ -$(MAKE) -C vnetd clean
+ -$(MAKE) -C vnet-module clean
--- /dev/null
+Vnet Module Command Interface
+Mike Wray <mike.wray@hp.com>
+2004/09/17
+
+When loaded with insmod, the vnet module creates /proc/vnet/policy, which
+can be used to control the module by writing commands into it.
+The return code from the command should be returned by close.
+
+The commands are:
+
+(vnet.add (id <id>) [(security { none | auth | conf } )] )
+
+Create the vnet with id <id> and the given security level (default none).
+Security levels:
+- none: no security
+- auth: message authentication (IPSEC hmac)
+- conf: message confidentiality (IPSEC hmac and encryption)
+
+(vnet.del (id <id>))
+
+Delete the vnet with id <id>.
+
+(vif.add (vnet <vnetid>) (vmac <macaddr>))
+
+Add the vif with MAC address <macaddr> to the vnet with id <vnetid>.
+This makes the vnet module respond to VARP requests for <macaddr>
+on vnet <vnetid>.
+
+(vif.del (vnet <vnetid>) (vmac <macaddr>))
+
+Remove the vif with MAC address <macaddr> from the vnet with id <vnetid>.
+The vnet module will stop responding to VARP for the vif.
+
+Examples:
+
+To create vnet 10 with no security:
+
+echo '(vnet.add (id 10))' > /proc/vnet/policy
+
+To create vnet 11 with message authentication:
+
+echo '(vnet.add (id 11) (security auth))' > /proc/vnet/policy
+
+To add the vif with vmac "aa:00:00:bc:34:ae" to vnet 10:
+
+echo '(vif.add (vnet 10) (vmac aa:00:00:bc:34:ae))' > /proc/vnet/policy
+
+To remove the vif from the vnet:
+
+echo '(vif.del (vnet 10) (vmac aa:00:00:bc:34:ae))' > /proc/vnet/policy
--- /dev/null
+
+Vnets: Virtual Networks for Virtual Machines
+
+Mike Wray <mike.wray@hp.com>
+
+0) Introduction
+---------------
+
+Vnets provide virtual private LANs for virtual machines.
+This is done using bridging and tunneling. A virtual interface
+on a vnet can only see other interfaces on the same vnet - it cannot
+see the real network, and the real network cannot see it either.
+
+Virtual interfaces on the same vnet can be on the same machine
+or on different machines; either way they can still talk to each other.
+The hosting machines can even be on different subnets if you run vnetd
+to forward, or have multicast routing enabled.
+
+
+1) Installing vnet support
+--------------------------
+
+Assuming the code has been installed (make install in the parent directory),
+configure xend to use 'network-vnet' instead of the default 'network' to
+start up networking. This just loads the vnet module when networking starts.
+
+In /etc/xen/xend-config.sxp:
+
+Configure the network script:
+
+(network-script network-vnet)
+
+Restart xend.
+
+2) Creating vnets
+-----------------
+
+Xend already implements commands to add/remove vnets and
+bridge to them. To add a vnet use
+
+xm call vnet_add <vnet config file>
+
+For example, if vnet97.sxp contains:
+
+(vnet (id 97) (bridge vnet97) (vnetif vnetif97) (security none))
+
+do
+
+xm call vnet_add vnet97.sxp
+
+This will define a vnet with id 97 and no security. The bridge for the
+vnet is called vnet97 and the virtual interface for it is vnetif97.
+To add an interface on a vm to this vnet simply set its bridge to vnet97
+in its configuration.
+
+In Python:
+
+vif="bridge=vnet97"
+
+In sxp:
+
+(device (vif (mac aa:00:00:01:02:03) (bridge vnet97)))
+
+Once configured, vnets are persistent in the xend database.
+To remove a vnet use
+
+xm call vnet_delete <vnet id>
+
+To list vnets use
+
+xm call vnets
+
+To get information on a vnet id use
+
+xm call vnet <vnet id>
+
+3) Troubleshooting
+------------------
+
+The vnet module should appear in 'lsmod'.
+If a vnet has been configured it should appear in the output of 'xm call vnets'.
+Its bridge and interface should appear in 'ifconfig'.
+It should also show in 'brctl show', with its attached interfaces.
+
+You can 'see into' a vnet from dom0 if you put an IP address on the bridge.
+For example, if you have vnet97 with a vm with ip addr 10.0.0.12 on it,
+then
+
+ifconfig vnet97 10.0.0.20 up
+
+should let you ping 10.0.0.12 via the vnet97 bridge.
+
+4) Examples
+-----------
+
+Here's the full config for a vm on vnet 97, using ip addr 10.0.0.12:
+
+(vm
+ (name dom12)
+ (memory '64')
+ (cpu '1')
+ (console '8502')
+ (image
+ (linux
+ (kernel /boot/vmlinuz-2.6.9-xenU)
+ (ip 10.0.0.12:1.2.3.4::::eth0:off)
+ (root /dev/hda1)
+ (args 'rw fastboot 4')
+ )
+ )
+ (device (vbd (uname phy:hda2) (dev hda1) (mode w)))
+ (device (vif (mac aa:00:00:11:00:12) (bridge vnet97)))
+)
+
+If you run another vm on the same vnet:
+
+(vm
+ (name dom11)
+ (memory '64')
+ (cpu '1')
+ (console '8501')
+ (image
+ (linux
+ (kernel /boot/vmlinuz-2.6.9-xenU)
+ (ip 10.0.0.11:1.2.3.4::::eth0:off)
+ (root /dev/hda1)
+ (args 'rw fastboot 4')
+ )
+ )
+ (device (vbd (uname phy:hda3) (dev hda1) (mode w)))
+ (device (vif (mac aa:00:00:11:00:11) (bridge vnet97)))
+)
+
+the vms should be able to talk over the vnet. Check with ping.
+If they are both on the same machine the connection will simply
+be the vnet97 bridge; if they are on separate machines their
+packets will be tunneled in etherip. They should be able to
+see each other, but not the real network.
+
+
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+
+XEN_SCRIPT_DIR:=/etc/xen/scripts
+
+all:
+
+install:
+ install -m 0755 -d $(prefix)$(XEN_SCRIPT_DIR)
+ install -m 0554 network-vnet $(prefix)$(XEN_SCRIPT_DIR)
+
+clean:
\ No newline at end of file
--- /dev/null
+#!/bin/sh
+#============================================================================
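+# Xen network start/stop script with vnet support.
+# Xend calls a network script when it starts.
+# The script name to use is defined in /etc/xen/xend-config.sxp
+# in the network-script field.
+# This is the default network script, extended to load the vnet
+# module when networking starts.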
+#
+# This script creates a bridge (default xen-br0), adds a device
+# (default eth0) to it, copies the IP addresses from the device
+# to the bridge and adjusts the routes accordingly.
+#
+# If all goes well, this should ensure that networking stays up.
+# However, some configurations are upset by this, especially
+# NFS roots. If the bridged setup does not meet your needs,
+# configure a different script, for example using routing instead.
+#
+# Usage:
+#
+# network-vnet (start|stop|status) {VAR=VAL}*
+#
+# Vars:
+#
+# bridge The bridge to use (default xen-br0).
+# netdev The interface to add to the bridge (default eth0).
+# antispoof Whether to use iptables to prevent spoofing (default yes).
+#
+# start:
+# Creates the bridge and enslaves netdev to it.
+# Copies the IP addresses from netdev to the bridge.
+# Deletes the routes to netdev and adds them on bridge.
+# Loads the vnet module if it is installed and not already loaded.
+#
+# stop:
+# Removes netdev from the bridge.
+# Deletes the routes to bridge and adds them to netdev.
+#
+# status:
+# Print ifconfig for netdev and bridge.
+# Print routes.
+#
+#============================================================================
+
+# Exit if anything goes wrong.
+set -e
+
+# First arg is the operation.
+OP=$1
+shift
+
+# Pull variables in args in to environment.
+for arg ; do export "${arg}" ; done
+
+bridge=${bridge:-xen-br0}
+netdev=${netdev:-eth0}
+antispoof=${antispoof:-yes}
+
+echo "network $OP bridge=$bridge netdev=$netdev antispoof=$antispoof"
+
+# Usage: transfer_addrs src dst
+# Copy all IP addresses (including aliases) from device $src to device $dst.
+# The addresses are only added to $dst; nothing here removes them from $src.
+transfer_addrs () {
+ local src=$1
+ local dst=$2
+ # Don't bother if $dst already has IP addresses.
+ if ip addr show dev ${dst} | egrep -q '^ *inet' ; then
+ return
+ fi
+ # Address lines start with 'inet' and have the device in them.
+ # Replace 'inet' with 'ip addr add' and change the device name $src
+ # to 'dev $dst'. Remove netmask as we'll add routes later.
+ # The rewritten lines are shell commands; 'sh -e' runs them and stops
+ # at the first one that fails.
+ ip addr show dev ${src} | egrep '^ *inet' | sed -e "
+s/inet/ip addr add/
+s@\([0-9]\+\.[0-9]\+\.[0-9]\+\.[0-9]\+\)/[0-9]\+@\1@
+s/${src}/dev ${dst}/
+" | sh -e
+}
+
+# Usage: transfer_routes src dst
+# Get all IP routes to device $src, delete them, and
+# add the same routes to device $dst.
+# The original routes have to be deleted, otherwise adding them
+# for $dst fails (duplicate routes).
+transfer_routes () {
+ local src=$1
+ local dst=$2
+ # List all routes and grep the ones with $src in.
+ # Stick 'ip route del' on the front to delete.
+ # Change $src to $dst and use 'ip route add' to add.
+ #
+ # The sed script turns each route line into two commands:
+ #   h            save the original line in the hold space
+ #   s/^/...del/  prefix it with 'ip route del' and print it (P)
+ #   g            restore the original line from the hold space
+ #   s/src/dst/   rename the device, prefix 'ip route add', print it (P)
+ #   d            drop the pattern space so nothing is printed twice
+ # The generated commands are run by 'sh -e', which stops on the first failure.
+ ip route list | grep ${src} | sed -e "
+h
+s/^/ip route del /
+P
+g
+s/${src}/${dst}/
+s/^/ip route add /
+P
+d
+" | sh -e
+}
+
+# Usage: create_bridge dev bridge
+# Create bridge $bridge if it does not already exist, and bring it up.
+# $dev is accepted but not used here: the device is added to the bridge
+# later by op_start (brctl addif).
+create_bridge () {
+ local dev=$1
+ local bridge=$2
+
+ # Don't create the bridge if it already exists.
+ # New bridges get spanning tree switched off and a forwarding delay of 0.
+ if ! brctl show | grep -q ${bridge} ; then
+ brctl addbr ${bridge}
+ brctl stp ${bridge} off
+ brctl setfd ${bridge} 0
+ fi
+ ifconfig ${bridge} up
+}
+
+# Usage: antispoofing dev bridge
+# Set the default policy of the iptables FORWARD chain to DROP, then
+# append a rule accepting packets that enter through physical device $dev.
+# $bridge is accepted but not used by the rules below.
+antispoofing () {
+ local dev=$1
+ local bridge=$2
+
+ iptables -P FORWARD DROP
+ iptables -A FORWARD -m physdev --physdev-in ${dev} -j ACCEPT
+}
+
+# Usage: show_status dev bridge
+# Report the interface configuration of $dev and $bridge, followed by the
+# routing table as shown by both 'ip route list' and 'route -n'.
+# The report is framed by a separator line at the top and bottom.
+show_status () {
+    local iface=$1
+    local br=$2
+    local rule='============================================================'
+
+    echo "${rule}"
+    ifconfig ${iface}
+    ifconfig ${br}
+    echo ' '
+    ip route list
+    echo ' '
+    route -n
+    echo "${rule}"
+}
+
+# Insert the vnet module if it can be found and
+# it's not already there.
+# Looks under /lib/modules/$(uname -r)/kernel for vnet_module.o and then
+# vnet_module.ko, and insmods the first file that exists.
+# Returns without doing anything when no module file is installed.
+vnet_insert () {
+    local module="vnet_module"
+    local mod_dir="/lib/modules/$(uname -r)/kernel"
+    local mod_path="${mod_dir}/${module}"
+    local mod_obj=""
+    # Keep the loop variables local so they do not leak into the script.
+    local ext f
+
+    for ext in ".o" ".ko" ; do
+        f="${mod_path}${ext}"
+        if [ -f "${f}" ] ; then
+            mod_obj="${f}"
+            break
+        fi
+    done
+    # -z instead of '== ""': the script runs under /bin/sh and '==' is a
+    # bashism that a POSIX test/[ does not accept.
+    if [ -z "${mod_obj}" ] ; then
+        return
+    fi
+    if lsmod | grep -q "${module}" ; then
+        echo "VNET: ${module} loaded"
+    else
+        echo "VNET: Loading ${module}..."
+        insmod "${mod_obj}"
+    fi
+}
+
+# Handle 'start': set up the bridge, move addresses and routes from
+# $netdev onto it, optionally install the antispoofing rules, and load
+# the vnet module. Does nothing when bridge is set to 'null'.
+op_start () {
+    # '=' rather than '==': the script runs under /bin/sh and '==' is a
+    # bashism that a POSIX test/[ does not accept.
+    if [ "${bridge}" = "null" ] ; then
+        return
+    fi
+    # Create the bridge and give it the interface IP addresses.
+    # Move the interface routes onto the bridge.
+    create_bridge ${netdev} ${bridge}
+    transfer_addrs ${netdev} ${bridge}
+    transfer_routes ${netdev} ${bridge}
+    # Don't add $netdev to $bridge if it's already on a bridge.
+    if ! brctl show | grep -q ${netdev} ; then
+        brctl addif ${bridge} ${netdev}
+    fi
+
+    # Quoted so the test stays well-formed whatever value was passed in.
+    if [ "${antispoof}" = 'yes' ] ; then
+        antispoofing ${netdev} ${bridge}
+    fi
+
+    vnet_insert
+}
+
+# Handle 'stop': detach $netdev from the bridge and move the routes back
+# onto it. Does nothing when bridge is set to 'null'.
+# The bridge itself, the iptables rules and the vnet module are left alone.
+op_stop () {
+    # '=' rather than '==': the script runs under /bin/sh and '==' is a
+    # bashism that a POSIX test/[ does not accept.
+    if [ "${bridge}" = "null" ] ; then
+        return
+    fi
+    # Remove the interface from the bridge.
+    # Move the routes back to the interface.
+    brctl delif ${bridge} ${netdev}
+    transfer_routes ${bridge} ${netdev}
+
+    # It's not our place to be enabling forwarding...
+}
+
+# Dispatch on the operation given as the first argument.
+# Anything other than start, stop or status prints a short usage
+# message and exits with status 1.
+case ${OP} in
+ start)
+ op_start
+ ;;
+
+ stop)
+ op_stop
+ ;;
+
+ status)
+ show_status ${netdev} ${bridge}
+ ;;
+
+ *)
+ echo 'Unknown command: ' ${OP}
+ echo 'Valid commands are: start, stop, status'
+ exit 1
+esac
--- /dev/null
+# Vnet configuration for a vnet with id 97 and no security.
+# Configure using 'xm call vnet_add vnet97.sxp'.
+(vnet (id 97) (bridge vnet97) (vnetif vnetif97) (security none))
--- /dev/null
+# Vnet configuration for a vnet with id 98 and message authentication.
+# Configure using 'xm call vnet_add vnet98.sxp'.
+(vnet (id 98) (bridge vnet98) (vnetif vnetif98) (security auth))
--- /dev/null
+# Vnet configuration for a vnet with id 99 and message confidentiality.
+# Configure using 'xm call vnet_add vnet99.sxp'.
+(vnet (id 99) (bridge vnet99) (vnetif vnetif99) (security conf))
--- /dev/null
+Vnet module for network virtualization.
+Mike Wray <mike.wray@hp.com>
+
+*) Compiling
+The vnet module can be compiled for 2.4 or 2.6 series kernels.
+The makefiles use the following variables, which
+can be set in your env or on the make command line:
+
+LINUX_RELEASE: linux release to compile for, 2.4 or 2.6 (default).
+XEN_ROOT: root of the xen tree containing kernel source. Default '../../..'.
+prefix: root path to install in, default is XEN_ROOT/install/.
+ Set to '/' to install relative to filesystem root.
+KERNEL_VERSION: kernel version, default got from XEN_ROOT.
+KERNEL_MINOR: kernel minor version, default -xen0.
+KERNEL_SRC: path to kernel source, default linux-<VERSION> under XEN_ROOT.
+
+*) For 2.4 kernel
+
+To compile from scratch:
+
+make LINUX_RELEASE=2.4 clean
+make LINUX_RELEASE=2.4
+
+This will build vnet_module.o in the current directory.
+To install the module use
+
+make LINUX_RELEASE=2.4 install
+
+*) For 2.6 kernel
+
+To compile from scratch:
+
+make clean
+make LINUX_RELEASE=2.6
+
+This will build vnet_module.ko in the current directory.
+To install the module use
+
+make LINUX_RELEASE=2.6 install
+
+
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+#
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free software Foundation, Inc.,
+# 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+
+#============================================================================
+ifeq ($(src),)
+LINUX_RELEASE ?=2.6
+
+include Makefile-$(LINUX_RELEASE)
+
+#============================================================================
+else
+#============================================================================
+# This section is for the 2.6 kbuild.
+
+#$(warning KBUILD_EXTMOD $(KBUILD_EXTMOD))
+#$(warning src $(src))
+#$(warning obj $(obj))
+
+include $(src)/Makefile.vnet
+
+obj-m = vnet_module.o
+vnet_module-objs = $(VNET_OBJ)
+vnet_module-objs += $(VNET_LIB_OBJ)
+
+#----------------------------------------------------------------------------
+# The fancy stuff in the kernel build defeats 'vpath %.c' so we can't
+# use that to get the lib files compiled.
+# Setup explicit rules for them using the kbuild C compile rule.
+
+# File names in the lib dir.
+remote_srcs = $(foreach file,$(VNET_LIB_SRC),$(LIB_DIR)/$(file))
+
+# Equivalent file names here.
+local_srcs = $(foreach file,$(VNET_LIB_SRC),$(src)/$(file))
+
+# Objects for the local names.
+local_objs = $(local_srcs:.c=.o)
+
+# Make the local objects depend on compiling the remote sources.
+$(local_objs): $(src)/%.o: $(LIB_DIR)/%.c
+ $(call if_changed_rule,cc_o_c)
+#----------------------------------------------------------------------------
+
+vpath %.h $(LIB_DIR)
+EXTRA_CFLAGS += -I $(LIB_DIR)
+EXTRA_CFLAGS += -I $(src)
+
+endif
+#============================================================================
+
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+#
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free software Foundation, Inc.,
+# 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+
+#============================================================================
+# Vnet module makefile for 2.4 series kernels.
+
+include Makefile.ver
+
+KERNEL_MODULE := vnet_module.o
+
+# 1 if the kernel was configured with CONFIG_MODVERSIONS=y, 0 otherwise
+# (including when the kernel .config cannot be read).
+# grep -q keeps the matched line out of the captured output; without it the
+# value would be 'CONFIG_MODVERSIONS=y 1' and the ifeq test below could
+# never match.
+CONFIG_MODVERSIONS := $(shell grep -q 'CONFIG_MODVERSIONS=y' $(KERNEL_SRC)/.config 2>/dev/null && echo 1 || echo 0)
+
+include Makefile.vnet
+
+VNET_OBJ += $(VNET_LIB_OBJ)
+
+#----------------------------------------------------------------------------
+
+vpath %.h $(KERNEL_SRC)/include
+INCLUDES+= -I $(KERNEL_SRC)/include
+
+vpath %.h $(LIB_DIR)
+vpath %.c $(LIB_DIR)
+INCLUDES += -I $(LIB_DIR)
+
+INCLUDES+= -I .
+
+#----------------------------------------------------------------------------
+
+CPPFLAGS += -D__KERNEL__
+CPPFLAGS += -DMODULE
+
+ifeq ($(CONFIG_MODVERSIONS), 1)
+CPPFLAGS += -DMODVERSIONS
+CPPFLAGS += -include $(KERNEL_SRC)/include/linux/modversions.h
+endif
+
+CPPFLAGS += $(INCLUDES)
+
+CFLAGS += -Wall
+CFLAGS += -Wstrict-prototypes
+CFLAGS += -Wno-trigraphs
+CFLAGS += -Wno-unused-function
+CFLAGS += -Wno-unused-parameter
+
+CFLAGS += -g
+CFLAGS += -O2
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -fno-common
+#CFLAGS += -fomit-frame-pointer
+
+# Dependencies. Gcc generates them for us.
+CFLAGS += -Wp,-MD,.$(@F).d
+VNET_DEP = .*.d
+#----------------------------------------------------------------------------
+
+.PHONY: all
+all: module
+
+.PHONY: module modules
+module modules: $(KERNEL_MODULE)
+
+$(KERNEL_MODULE): $(VNET_OBJ)
+ $(LD) -r -o $@ $^
+
+.PHONY: install install-module modules_install
+install install-module modules_install: module
+ install -m 0755 -d $(prefix)$(KERNEL_MODULE_DIR)
+ install -m 0554 $(KERNEL_MODULE) $(prefix)$(KERNEL_MODULE_DIR)
+
+TAGS:
+ etags *.c *.h
+
+.PHONY: clean
+clean:
+ @rm -f *.a *.o *.ko *~
+ @rm -f $(VNET_DEP) .*.cmd *.mod.?
+ @rm -rf .tmp_versions
+
+-include $(VNET_DEP)
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+#
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free software Foundation, Inc.,
+# 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+
+#============================================================================
+# Vnet module makefile for 2.6 series kernels.
+
+LINUX_RELEASE ?= 2.6
+include Makefile.ver
+
+KERNEL_MODULE = vnet_module.ko
+
+#----------------------------------------------------------------------------
+#export KBUILD_VERBOSE=1
+
+.PHONY: all
+all: module
+
+# Build the module with the 2.6 kbuild system: run the kernel's own
+# makefile with M pointing back at this directory.
+# Both target names are phony, matching Makefile-2.4.
+.PHONY: module modules
+module modules:
+	$(MAKE) -C $(KERNEL_SRC) M=`pwd` modules
+
+.PHONY: install install-module modules_install
+install install-module modules_install: module
+ install -m 0755 -d $(prefix)$(KERNEL_MODULE_DIR)
+ install -m 0554 $(KERNEL_MODULE) $(prefix)$(KERNEL_MODULE_DIR)
+
+# Remove kbuild output and local build products.
+# Use $(CURDIR), which make sets to the directory it is running in.
+# $(PWD) is only inherited from the environment, so when this makefile is
+# reached through 'make -C vnet-module clean' it still names the caller's
+# directory and the kernel clean would run on the wrong tree.
+.PHONY: clean
+clean:
+	@$(MAKE) -C $(KERNEL_SRC) M=$(CURDIR) clean
+	@rm -f *.a *.o *.ko *~ .*.d .*.cmd *.mod.?
+
+TAGS:
+ etags *.c *.h
+
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+#
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free software Foundation, Inc.,
+# 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+
+#----------------------------------------------------------------------------
+# Xeno/xen.
+
+# Root of xen tree.
+XEN_ROOT ?=../../..
+
+# Path to relativize the install. Set to /
+# to install relative to filesystem root.
+prefix ?=$(XEN_ROOT)/install/
+#----------------------------------------------------------------------------
+
+LINUX_RELEASE ?=2.6
+KERNEL_MINOR ?=-xen0
+
+LINUX_VERSION ?= $(shell ( /bin/ls -ld $(XEN_ROOT)/linux-$(LINUX_RELEASE).*-xen-sparse ) 2>/dev/null | \
+ sed -e 's!^.*linux-\(.\+\)-xen-sparse!\1!' )
+
+ifeq ($(LINUX_VERSION),)
+$(error Kernel source for linux $(LINUX_RELEASE) not found)
+endif
+
+KERNEL_VERSION =$(LINUX_VERSION)$(KERNEL_MINOR)
+
+KERNEL_SRC ?= $(XEN_ROOT)/linux-$(KERNEL_VERSION)
+
+KERNEL_MODULE_DIR = /lib/modules/$(KERNEL_VERSION)/kernel
+
+#$(warning KERNEL_VERSION $(KERNEL_VERSION))
+#$(warning KERNEL_SRC $(KERNEL_SRC))
--- /dev/null
+# -*- mode: Makefile; -*-
+#============================================================================
+#
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by the
+# Free Software Foundation; either version 2 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free software Foundation, Inc.,
+# 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+#============================================================================
+
+ifeq ($(src),)
+SRC_DIR=
+else
+SRC_DIR=$(src)/
+endif
+
+LIB_DIR := $(SRC_DIR)../../libxutil
+
+VNET_SRC :=
+VNET_SRC += esp.c
+VNET_SRC += etherip.c
+VNET_SRC += random.c
+VNET_SRC += sa_algorithm.c
+VNET_SRC += sa.c
+VNET_SRC += skb_context.c
+VNET_SRC += skb_util.c
+VNET_SRC += tunnel.c
+VNET_SRC += varp.c
+VNET_SRC += varp_socket.c
+VNET_SRC += vif.c
+VNET_SRC += vnet.c
+VNET_SRC += vnet_dev.c
+VNET_SRC += vnet_ioctl.c
+
+# Sources taken from the shared library directory (LIB_DIR).
+# Start from an empty list, as VNET_SRC does, so a value inherited from
+# the environment or an earlier include cannot leak into the build.
+VNET_LIB_SRC :=
+VNET_LIB_SRC += allocate.c
+VNET_LIB_SRC += enum.c
+VNET_LIB_SRC += hash_table.c
+VNET_LIB_SRC += iostream.c
+VNET_LIB_SRC += kernel_stream.c
+VNET_LIB_SRC += sxpr.c
+VNET_LIB_SRC += sxpr_parser.c
+VNET_LIB_SRC += sys_net.c
+VNET_LIB_SRC += sys_string.c
+
+VNET_OBJ := $(VNET_SRC:.c=.o)
+VNET_LIB_OBJ := $(VNET_LIB_SRC:.c=.o)
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <asm/uaccess.h>
+
+#include <linux/init.h>
+
+#include <linux/version.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+
+#include <linux/if_ether.h>
+#include <linux/icmp.h>
+
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+
+#include <esp.h>
+#include <sa.h>
+#include <sa_algorithm.h>
+#include <tunnel.h>
+#include <vnet.h>
+#include <skb_util.h>
+
+/* Flag named for ICV debugging, defined as 0 (off).
+ * NOTE(review): its uses are not visible in this part of the file.
+ */
+static const int DEBUG_ICV = 0;
+
+/* Presumably the name prefix used by the debug.h macros - TODO confirm. */
+#define MODULE_NAME "IPSEC"
+/* DEBUG is defined and then immediately undefined, so debug.h is included
+ * with DEBUG unset. Delete the #undef line to include it with DEBUG set.
+ */
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/* Outgoing packet: [ eth | ip | data ]
+ * After etherip: [ eth2 | ip2 | ethip | eth | ip | data ]
+ * After esp : [ eth2 | ip2 | esp | {ethip | eth | ip | data} | pad | icv ]
+ * ^ +
+ * The curly braces { ... } denote encryption.
+ * The esp header includes the fixed esp headers and the iv (variable size).
+ * The point marked ^ does not move. To the left is in the header, to the right
+ * is in the frag. Remember that all outgoing skbs (from domains) have 1 frag.
+ * Data after + is added by esp, using an extra frag.
+ *
+ * Incoming as above.
+ * After decrypt: [ eth2 | ip2 | esp | ethip | eth | ip | data | pad | icv ]
+ * Trim tail: [ eth2 | ip2 | esp | ethip | eth | ip | data ]
+ * Drop hdr: [ eth2 | ip2 | ethip | eth | ip | data ]
+ * ^
+ * The point marked ^ does not move. Incoming skbs are linear (no frags).
+ * The tail is trimmed by adjusting skb->tail and len.
+ * The esp hdr is dropped by using memmove to move the headers and
+ * adjusting the skb pointers.
+ *
+ * todo: Now this code is in linux we can't assume 1 frag for outbound skbs,
+ * or (maybe) that memmove is safe on inbound.
+ */
+
+/** Round n up to the next multiple of block.
+ * A block size below 2 leaves n unchanged.
+ * Any other block size is assumed to be a power of 2, because the
+ * rounding is done by masking off the low bits.
+ *
+ * @param n value to round up
+ * @param block block size to round to a multiple of
+ * @return n rounded up to a multiple of block
+ */
+static inline int roundup(int n, int block){
+    int mask;
+
+    if(block < 2){
+        return n;
+    }
+    mask = block - 1;
+    return (n + mask) & ~mask;
+}
+
+/** Test whether n is a whole number of blocks.
+ * A block size below 2 always counts as a match.
+ * Any other block size is assumed to be a power of 2, because the
+ * test is done by masking the low bits of n.
+ *
+ * @param n value to check
+ * @param block block size
+ * @return 1 if n is a multiple of block, 0 otherwise
+ */
+static inline int multipleof(int n, int block){
+    return (block < 2) || ((n & (block - 1)) == 0);
+}
+
+/** Convert a size in bits to a size in bytes.
+ * The division truncates, so callers that need whole bytes round the
+ * bit count up to a multiple of 8 first.
+ *
+ * @param n number of bits
+ * @return number of bytes
+ */
+static inline int bits_to_bytes(int n){
+    const int bits_per_byte = 8;
+
+    return n / bits_per_byte;
+}
+
+
+/** Write the esp padding trailer near the end of an skb.
+ * The trailer is the padding bytes (counting up from 1), then the
+ * number of padding bytes, then the protocol number.
+ *
+ * @param skb skb
+ * @param offset offset from skb end to where the trailer should end
+ * @param extra_n total trailer size: padding bytes plus the ESP_PAD_N
+ *        bytes holding the pad count and protocol
+ * @param protocol protocol number (from original ip hdr)
+ * @return 0 on success, error code otherwise
+ */
+static int esp_sa_pad(struct sk_buff *skb, int offset, int extra_n,
+                      unsigned char protocol){
+    const int pad_n = extra_n - ESP_PAD_N;
+    char trailer[extra_n];
+    int k;
+
+    // Padding bytes have the values 1, 2, ..., pad_n.
+    for(k = 0; k < pad_n; k++){
+        trailer[k] = k + 1;
+    }
+    trailer[pad_n] = pad_n;
+    trailer[pad_n + 1] = protocol;
+    return skb_put_bits(skb, skb->len - offset - extra_n, trailer, extra_n);
+}
+
+/** Encrypt part of an skb in place, skipping the esp header and iv.
+ * Assumes skb->data points at esp header.
+ *
+ * When an iv is used, the iv held in the esp state is loaded into the
+ * cipher before encrypting and copied into the esp header afterwards;
+ * the state's iv is then refreshed from the cipher for the next packet.
+ *
+ * @param esp esp state
+ * @param esph esp header
+ * @param skb packet
+ * @param head_n size of esp header and iv
+ * @param iv_n size of iv
+ * @param text_n size of ciphertext
+ * @return 0 on success, error code otherwise
+ */
+static int esp_sa_encrypt(ESPState *esp, ESPHdr *esph, struct sk_buff *skb,
+                          int head_n, int iv_n, int text_n){
+    struct crypto_tfm *tfm = esp->cipher.tfm;
+    int frag_n = skb_shinfo(skb)->nr_frags + 1;
+    struct scatterlist frags[frag_n];
+    int err;
+
+    err = skb_scatterlist(skb, frags, &frag_n, head_n, text_n);
+    if(err){
+        return err;
+    }
+    if(iv_n){
+        crypto_cipher_set_iv(tfm, esp->cipher.iv, iv_n);
+    }
+    crypto_cipher_encrypt(tfm, frags, frags, text_n);
+    if(iv_n){
+        // Publish the iv used for this packet, then fetch the next one.
+        memcpy(esph->data, esp->cipher.iv, iv_n);
+        crypto_cipher_get_iv(tfm, esp->cipher.iv, iv_n);
+    }
+    return 0;
+}
+
+/** Decrypt part of an skb in place, skipping the esp header and iv.
+ * Assumes skb->data points at esp header.
+ * When an iv is used it is taken from the data following the esp header.
+ *
+ * @param esp esp state
+ * @param esph esp header
+ * @param skb packet
+ * @param head_n size of esp header and iv
+ * @param iv_n size of iv
+ * @param text_n size of ciphertext
+ * @return 0 on success, error code otherwise
+ */
+static int esp_sa_decrypt(ESPState *esp, ESPHdr *esph, struct sk_buff *skb,
+                          int head_n, int iv_n, int text_n){
+    struct crypto_tfm *tfm = esp->cipher.tfm;
+    int frag_n = skb_shinfo(skb)->nr_frags + 1;
+    struct scatterlist frags[frag_n];
+    int err;
+
+    err = skb_scatterlist(skb, frags, &frag_n, head_n, text_n);
+    if(err){
+        return err;
+    }
+    if(iv_n){
+        crypto_cipher_set_iv(tfm, esph->data, iv_n);
+    }
+    crypto_cipher_decrypt(tfm, frags, frags, text_n);
+    return 0;
+}
+
+/** Compute the icv and store it in the skb.
+ * The digest covers the esp header, iv and ciphertext; the icv is
+ * written immediately after the digested bytes.
+ * Assumes skb->data points at esp header.
+ *
+ * @param esp esp state
+ * @param skb packet
+ * @param digest_n number of bytes to digest
+ * @param icv_n size of icv
+ * @return 0 on success, error code otherwise
+ */
+static int esp_sa_digest(ESPState *esp, struct sk_buff *skb, int digest_n, int icv_n){
+    u8 icv[icv_n];
+
+    if(DEBUG_ICV){
+        dprintf("> skb digest_n=%d icv_n=%d\n", digest_n, icv_n);
+        skb_print_bits(skb, 0, digest_n);
+    }
+    memset(icv, 0, icv_n);
+    esp->digest.icv(esp, skb, 0, digest_n, icv);
+    // Report a failed write to the caller instead of claiming success.
+    return skb_put_bits(skb, digest_n, icv, icv_n);
+}
+
+/** Check the icv and trim it from the skb tail.
+ * The icv is read from the last icv_n bytes of the skb and compared
+ * with one computed over everything before it.
+ *
+ * @param sa sa state (its integrity failure count is updated)
+ * @param esp esp state
+ * @param esph esp header (unused)
+ * @param skb packet
+ * @return 0 on success, error code otherwise
+ */
+static int esp_check_icv(SAState *sa, ESPState *esp, ESPHdr *esph, struct sk_buff *skb){
+    int err = 0;
+    int icv_n = esp->digest.icv_n;
+    int digest_n = skb->len - icv_n;
+    u8 icv_skb[icv_n];
+    u8 icv_new[icv_n];
+
+    dprintf(">\n");
+    if(DEBUG_ICV){
+        dprintf("> skb len=%d digest_n=%d icv_n=%d\n",
+                skb->len, digest_n, icv_n);
+        skb_print_bits(skb, 0, skb->len);
+    }
+    if(skb_copy_bits(skb, digest_n, icv_skb, icv_n)){
+        wprintf("> Error getting icv from skb\n");
+        // Without the icv the packet cannot be verified: fail, rather
+        // than fall through with err still 0 and accept it unchecked.
+        err = -EINVAL;
+        goto exit;
+    }
+    esp->digest.icv(esp, skb, 0, digest_n, icv_new);
+    if(DEBUG_ICV){
+        dprintf("> len=%d icv_n=%d", digest_n, icv_n);
+        printk("\nskb="); buf_print(icv_skb, icv_n);
+        printk("new="); buf_print(icv_new, icv_n);
+    }
+    if(unlikely(memcmp(icv_new, icv_skb, icv_n))){
+        wprintf("> ICV check failed!\n");
+        err = -EINVAL;
+        sa->counts.integrity_failures++;
+        goto exit;
+    }
+    skb_trim_tail(skb, icv_n);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Send a packet via an ESP SA.
+ *
+ * Makes room for the esp header, iv, padding and icv, inserts the esp
+ * header after the ip header, writes the padding trailer, encrypts
+ * (when the SA has SA_CONF set), appends the icv (when the digest has
+ * one), fixes up the ip header and hands the packet to the tunnel.
+ *
+ * @param sa SA state
+ * @param skb packet to send
+ * @param tunnel underlying tunnel
+ * @return 0 on success, negative error code otherwise
+ */
+static int esp_sa_send(SAState *sa, struct sk_buff *skb, Tunnel *tunnel){
+    int err = 0;
+    int ip_n;           // Size of ip header.
+    int plaintext_n;    // Size of plaintext.
+    int ciphertext_n;   // Size of ciphertext (including padding).
+    int extra_n;        // Extra bytes needed for ciphertext.
+    int icv_n = 0;      // Size of integrity check value (icv).
+    int iv_n = 0;       // Size of initialization vector (iv).
+    int head_n;         // Size of esp header and iv.
+    int tail_n;         // Size of esp trailer: padding and icv.
+    ESPState *esp;
+    ESPHdr *esph;
+
+    dprintf(">\n");
+    esp = sa->data;
+    ip_n = (skb->nh.iph->ihl << 2);
+    // Assuming skb->data points at ethernet header, exclude ethernet
+    // header and IP header.
+    plaintext_n = skb->len - ETH_HLEN - ip_n;
+    // Add size of padding fields.
+    ciphertext_n = roundup(plaintext_n + ESP_PAD_N, esp->cipher.block_n);
+    if(esp->cipher.pad_n > 0){
+        ciphertext_n = roundup(ciphertext_n, esp->cipher.pad_n);
+    }
+    extra_n = ciphertext_n - plaintext_n;
+    iv_n = esp->cipher.iv_n;
+    icv_n = esp->digest.icv_n;
+    dprintf("> len=%d plaintext=%d ciphertext=%d extra=%d\n",
+            skb->len, plaintext_n, ciphertext_n, extra_n);
+    dprintf("> iv=%d icv=%d\n", iv_n, icv_n);
+    skb_print_bits(skb, 0, skb->len);
+
+    // Add headroom for esp header and iv, tailroom for the ciphertext
+    // and icv.
+    head_n = ESP_HDR_N + iv_n;
+    tail_n = extra_n + icv_n;
+    err = skb_make_room(&skb, skb, head_n, tail_n);
+    if(err) goto exit;
+    dprintf("> skb=%p\n", skb);
+    // Move the headers up to make space for the esp header. We can
+    // use memmove() since all this data fits in the skb head.
+    // todo: Can't assume this anymore?
+    dprintf("> header push...\n");
+    __skb_push(skb, head_n);
+    // Moving the ethernet header is disabled (condition is constant 0).
+    if(0 && skb->mac.raw){
+        dprintf("> skb->mac=%p\n", skb->mac.raw);
+        dprintf("> ETH header pull...\n");
+        memmove(skb->data, skb->mac.raw, ETH_HLEN);
+        skb->mac.raw = skb->data;
+        __skb_pull(skb, ETH_HLEN);
+    }
+    dprintf("> IP header pull...\n");
+    memmove(skb->data, skb->nh.raw, ip_n);
+    skb->nh.raw = skb->data;
+    __skb_pull(skb, ip_n);
+    esph = (void*)skb->data;
+    // Add spi and sequence number.
+    esph->spi = sa->ident.spi;
+    esph->seq = htonl(++sa->replay.send_seq);
+    // Insert the padding bytes: extra bytes less the pad fields
+    // themselves. Do not send a packet whose trailer was not written.
+    dprintf("> esp_sa_pad ...\n");
+    err = esp_sa_pad(skb, icv_n, extra_n, skb->nh.iph->protocol);
+    if(err) goto exit;
+    if(sa->security & SA_CONF){
+        dprintf("> esp_sa_encrypt...\n");
+        err = esp_sa_encrypt(esp, esph, skb, head_n, iv_n, ciphertext_n);
+        if(err) goto exit;
+    }
+    if(icv_n){
+        dprintf("> esp_sa_digest...\n");
+        err = esp_sa_digest(esp, skb, head_n + ciphertext_n, icv_n);
+        if(err) goto exit;
+    }
+    dprintf("> IP header push...\n");
+    __skb_push(skb, ip_n);
+    // Disabled together with the ethernet header move above.
+    if(0 && skb->mac.raw){
+        dprintf("> ETH header push...\n");
+        __skb_push(skb, ETH_HLEN);
+    }
+    // Fix ip header. Adjust length field, set protocol, zero
+    // checksum.
+    {
+        // Total packet length (bytes).
+        int tot_len = ntohs(skb->nh.iph->tot_len);
+        tot_len += head_n;
+        tot_len += tail_n;
+        skb->nh.iph->protocol = IPPROTO_ESP;
+        skb->nh.iph->tot_len = htons(tot_len);
+        skb->nh.iph->check = 0;
+    }
+    err = Tunnel_send(tunnel, skb);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Free function for skb contexts created by esp.
+ * Clears the context's reference to its SA and drops the SA refcount.
+ * Does nothing if the context or its SA is null.
+ *
+ * @param context to free
+ */
+static void esp_context_free_fn(SkbContext *context){
+    SAState *sa = (context ? context->data : NULL);
+
+    if(sa){
+        context->data = NULL;
+        SAState_decref(sa);
+    }
+}
+
+/** Receive a packet via an ESP SA.
+ * Does ESP receive processing (check icv, decrypt), strips
+ * ESP header and re-receives.
+ *
+ * @param sa SA
+ * @param skb packet
+ * @return 1 once the packet has passed the size and icv checks
+ *         (whatever happens to it afterwards); otherwise the error
+ *         code of the check that failed
+ */
+static int esp_sa_recv(SAState *sa, struct sk_buff *skb){
+    int err = -EINVAL;
+    int mine = 0;
+    int vnet = 0; //todo: fixme - need to record skb vnet somewhere
+    ESPState *esp;
+    ESPHdr *esph;
+    ESPPadding *pad;
+    int block_n; // Cipher blocksize.
+    int icv_n;   // Size of integrity check value (icv).
+    int iv_n;    // Size of initialization vector (iv).
+    int text_n;  // Size of text (ciphertext or plaintext).
+    int head_n;  // Size of esp header and iv.
+
+    dprintf("> skb=%p\n", skb);
+    // Assumes skb->data points at esp hdr.
+    esph = (void*)skb->data;
+    esp = sa->data;
+    block_n = crypto_tfm_alg_blocksize(esp->cipher.tfm);
+    icv_n = esp->digest.icv_n;
+    iv_n = esp->cipher.iv_n;
+    head_n = ESP_HDR_N + iv_n;
+    text_n = skb->len - head_n - icv_n;
+    if(text_n < ESP_PAD_N || !multipleof(text_n, block_n)){
+        wprintf("> Invalid size: text_n=%d tfm:block_n=%d esp:block_n=%d\n",
+                text_n, block_n, esp->cipher.block_n);
+        goto exit;
+    }
+    if(icv_n){
+        err = esp_check_icv(sa, esp, esph, skb);
+        if(err) goto exit;
+    }
+    // From here on the packet is claimed and 1 is returned.
+    mine = 1;
+    if(sa->security & SA_CONF){
+        err = esp_sa_decrypt(esp, esph, skb, head_n, iv_n, text_n);
+        if(err) goto exit;
+    }
+    // Strip esp header by moving the other headers down.
+    //todo Maybe not safe to do this anymore.
+    memmove(skb->mac.raw + head_n, skb->mac.raw, (skb->data - skb->mac.raw));
+    skb->mac.raw += head_n;
+    skb->nh.raw += head_n;
+    // Move skb->data back to ethernet header.
+    // Do in 2 moves to ensure offsets are +ve,
+    // since args to skb_pull/skb_push are unsigned.
+    __skb_pull(skb, head_n);
+    __skb_push(skb, skb->data - skb->mac.raw);
+    // After this esph is invalid.
+    esph = NULL;
+    // Trim padding, restore protocol in IP header.
+    pad = skb_trim_tail(skb, ESP_PAD_N);
+    text_n -= ESP_PAD_N;
+    // pad_n is a u8, so only the comparison with the text size can fail.
+    if(pad->pad_n > text_n){
+        wprintf("> Invalid padding: pad_n=%d text_n=%d\n", pad->pad_n, text_n);
+        goto exit;
+    }
+    skb_trim_tail(skb, pad->pad_n);
+    skb->nh.iph->protocol = pad->protocol;
+    err = skb_push_context(skb, vnet, sa->ident.addr, IPPROTO_ESP,
+                           sa, esp_context_free_fn);
+    if(err) goto exit;
+    // Increase sa refcount now the skb context refers to it.
+    SAState_incref(sa);
+    err = netif_rx(skb);
+  exit:
+    if(mine) err = 1;
+    dprintf("< skb=%p err=%d\n", skb, err);
+    return err;
+}
+
+/** Estimate the size of some data after ESP processing.
+ * The data plus the pad fields is rounded up to the cipher block size
+ * (and to the padding block size, if one is set), then the icv and the
+ * fixed esp header are added. The iv size is not included.
+ *
+ * @param sa ESP SA
+ * @param data_n data size
+ * @return size after ESP processing
+ */
+static u32 esp_sa_size(SAState *sa, int data_n){
+    ESPState *esp = sa->data;
+    int size_n;
+
+    // Even in transport mode have to round up to blocksize.
+    // Have to add some padding for alignment even if pad_n is zero.
+    size_n = roundup(data_n + ESP_PAD_N, esp->cipher.block_n);
+    if(esp->cipher.pad_n > 0){
+        size_n = roundup(size_n, esp->cipher.pad_n);
+    }
+    return size_n + esp->digest.icv_n + ESP_HDR_N;
+}
+
+/** Compute an icv using HMAC digest.
+ * The digest is computed into the esp state's working buffer and the
+ * first icv_n bytes of it are copied out. Nothing is returned; the
+ * result of building the scatterlist is not checked.
+ *
+ * @param esp ESP state
+ * @param skb packet to digest
+ * @param offset offset to start at
+ * @param len number of bytes to digest
+ * @param icv return parameter for ICV (esp->digest.icv_n bytes)
+ */
+static inline void esp_hmac_digest(ESPState *esp, struct sk_buff *skb,
+                                   int offset, int len, u8 *icv){
+    struct crypto_tfm *tfm = esp->digest.tfm;
+    char *scratch = esp->digest.icv_tmp;
+    int frag_n = skb_shinfo(skb)->nr_frags + 1;
+    struct scatterlist frags[frag_n];
+
+    dprintf("> offset=%d len=%d\n", offset, len);
+    memset(icv, 0, esp->digest.icv_n);
+    if(DEBUG_ICV){
+        dprintf("> key len=%d\n", esp->digest.key_n);
+        printk("\nkey=");
+        buf_print(esp->digest.key,esp->digest.key_n);
+    }
+    crypto_hmac_init(tfm, esp->digest.key, &esp->digest.key_n);
+    (void)skb_scatterlist(skb, frags, &frag_n, offset, len);
+    crypto_hmac_update(tfm, frags, frag_n);
+    crypto_hmac_final(tfm, esp->digest.key, &esp->digest.key_n, scratch);
+    if(DEBUG_ICV){
+        dprintf("> digest len=%d ", esp->digest.icv_n);
+        printk("\nval=");
+        buf_print(scratch, esp->digest.icv_n);
+    }
+    memcpy(icv, scratch, esp->digest.icv_n);
+    dprintf("<\n");
+}
+
+/** Tear down an esp state.
+ * Frees the digest transform and its working buffer, the cipher
+ * transform and its iv, then the state itself. A null state is ignored.
+ *
+ * @param esp state
+ */
+static void esp_fini(ESPState *esp){
+    ESPDigest *digest;
+    ESPCipher *cipher;
+
+    if(!esp){
+        return;
+    }
+    digest = &esp->digest;
+    cipher = &esp->cipher;
+
+    if(digest->tfm){
+        crypto_free_tfm(digest->tfm);
+        digest->tfm = NULL;
+    }
+    if(digest->icv_tmp){
+        kfree(digest->icv_tmp);
+        digest->icv_tmp = NULL;
+    }
+    if(cipher->tfm){
+        crypto_free_tfm(cipher->tfm);
+        cipher->tfm = NULL;
+    }
+    if(cipher->iv){
+        kfree(cipher->iv);
+        cipher->iv = NULL;
+    }
+    kfree(esp);
+}
+
+/** Release the ESP state attached to an SA.
+ * Does nothing if the SA is null or has no state.
+ *
+ * @param sa ESP SA
+ */
+static void esp_sa_fini(SAState *sa){
+    if(sa && sa->data){
+        esp_fini(sa->data);
+        sa->data = NULL;
+    }
+}
+
+/** Initialize the cipher for an ESP SA.
+ * Looks up the cipher, allocates its transform (CBC, or ECB for the
+ * null cipher), records block and iv sizes, allocates a random initial
+ * iv if the cipher uses one, and sets the key.
+ * Nothing is released here on failure; the caller frees the state.
+ *
+ * @param sa ESP SA
+ * @param esp ESP state
+ * @return 0 on success, negative error code otherwise
+ */
+static int esp_cipher_init(SAState *sa, ESPState *esp){
+    int err = 0;
+    SAAlgorithm *algo = NULL;
+    int cipher_mode = CRYPTO_TFM_MODE_CBC;
+
+    dprintf("> sa=%p esp=%p\n", sa, esp);
+    dprintf("> cipher=%s\n", sa->cipher.name);
+    algo = sa_cipher_by_name(sa->cipher.name);
+    if(!algo){
+        wprintf("> Cipher unavailable: %s\n", sa->cipher.name);
+        err = -EINVAL;
+        goto exit;
+    }
+    // Take the key from the SA, as esp_digest_init does for the digest.
+    // NOTE(review): assumes sa->cipher has a key field like sa->digest
+    // - confirm against sa.h.
+    esp->cipher.key = sa->cipher.key;
+    // Key size is held in bytes, so convert from the SA's bit count.
+    esp->cipher.key_n = bits_to_bytes(roundup(sa->cipher.bits, 8));
+    // If cipher is null must use ECB because CBC algo does not support blocksize 1.
+    if(strcmp(sa->cipher.name, "cipher_null") == 0){
+        cipher_mode = CRYPTO_TFM_MODE_ECB;
+    }
+    esp->cipher.tfm = crypto_alloc_tfm(sa->cipher.name, cipher_mode);
+    if(!esp->cipher.tfm){
+        err = -ENOMEM;
+        goto exit;
+    }
+    esp->cipher.block_n = roundup(crypto_tfm_alg_blocksize(esp->cipher.tfm), 4);
+    esp->cipher.iv_n = crypto_tfm_alg_ivsize(esp->cipher.tfm);
+    esp->cipher.pad_n = 0;
+    if(esp->cipher.iv_n){
+        esp->cipher.iv = kmalloc(esp->cipher.iv_n, GFP_KERNEL);
+        if(!esp->cipher.iv){
+            err = -ENOMEM;
+            goto exit;
+        }
+        get_random_bytes(esp->cipher.iv, esp->cipher.iv_n);
+    }
+    // A cipher whose key was rejected must not be used.
+    err = crypto_cipher_setkey(esp->cipher.tfm, esp->cipher.key, esp->cipher.key_n);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Initialize the digest for an ESP SA.
+ * Takes the digest key from the SA, allocates the digest transform,
+ * looks up the algorithm to get the full and truncated icv sizes,
+ * checks the full size against the transform and allocates the
+ * working buffer for the full icv.
+ * Nothing is released here on failure; the caller frees the state.
+ *
+ * @param sa ESP SA
+ * @param esp ESP state
+ * @return 0 on success, negative error code otherwise
+ */
+static int esp_digest_init(SAState *sa, ESPState *esp){
+    ESPDigest *dg = &esp->digest;
+    SAAlgorithm *algo;
+    int err = 0;
+
+    dprintf(">\n");
+    dg->key = sa->digest.key;
+    dg->key_n = bits_to_bytes(roundup(sa->digest.bits, 8));
+    dg->tfm = crypto_alloc_tfm(sa->digest.name, 0);
+    if(!dg->tfm){
+        err = -ENOMEM;
+        goto done;
+    }
+    algo = sa_digest_by_name(sa->digest.name);
+    if(!algo){
+        wprintf("> Digest unavailable: %s\n", sa->digest.name);
+        err = -EINVAL;
+        goto done;
+    }
+    dg->icv = esp_hmac_digest;
+    dg->icv_full_n = bits_to_bytes(algo->info.digest.icv_fullbits);
+    dg->icv_n = bits_to_bytes(algo->info.digest.icv_truncbits);
+
+    // The algorithm table and the transform must agree on the size.
+    if(dg->icv_full_n != crypto_tfm_alg_digestsize(dg->tfm)){
+        err = -EINVAL;
+        wprintf("> digest %s, size %u != %hu\n",
+                sa->digest.name,
+                crypto_tfm_alg_digestsize(dg->tfm),
+                dg->icv_full_n);
+        goto done;
+    }
+
+    dg->icv_tmp = kmalloc(dg->icv_full_n, GFP_KERNEL);
+    if(!dg->icv_tmp){
+        err = -ENOMEM;
+    }
+  done:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Initialize an ESP SA.
+ * Allocates a zeroed ESP state, initializes its cipher and digest and
+ * attaches it to the SA. On failure the state is released and the SA
+ * is left without one.
+ *
+ * @param sa ESP SA
+ * @param args arguments (unused)
+ * @return 0 on success, negative error code otherwise
+ */
+static int esp_sa_init(SAState *sa, void *args){
+    ESPState *esp;
+    int err;
+
+    dprintf("> sa=%p\n", sa);
+    esp = kmalloc(sizeof(ESPState), GFP_KERNEL);
+    if(!esp){
+        err = -ENOMEM;
+    } else {
+        memset(esp, 0, sizeof(*esp));
+        err = esp_cipher_init(sa, esp);
+        if(!err){
+            err = esp_digest_init(sa, esp);
+        }
+        if(err){
+            esp_fini(esp);
+        } else {
+            sa->data = esp;
+        }
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** SA type descriptor for ESP.
+ * Registered by esp_module_init() and removed by esp_module_exit().
+ */
+static SAType esp_sa_type = {
+    .protocol = IPPROTO_ESP,
+    .name     = "ESP",
+    // Lifecycle.
+    .init     = esp_sa_init,
+    .fini     = esp_sa_fini,
+    // Packet processing.
+    .send     = esp_sa_send,
+    .recv     = esp_sa_recv,
+    .size     = esp_sa_size,
+};
+
+/** Get the ESP header from a packet.
+ * The header is taken to start at skb->data. Fails if the packet is
+ * shorter than the fixed esp header; esph is then left untouched.
+ *
+ * @param skb packet
+ * @param esph return parameter for header
+ * @return 0 on success, negative error code otherwise
+ */
+static int esp_skb_header(struct sk_buff *skb, ESPHdr **esph){
+    if(skb->len < ESP_HDR_N){
+        return -EINVAL;
+    }
+    *esph = (ESPHdr*)skb->data;
+    return 0;
+}
+
+/** Handle an incoming skb with ESP protocol.
+ *
+ * Pulls the ethernet and ip headers if skb->data still points at the
+ * ethernet header, then looks up the SA for the spi and destination
+ * address. If there is no SA, tries to create one. The packet is then
+ * passed to the SA, and the SA reference is dropped before returning.
+ *
+ * @param skb packet
+ * @return result of SA receive processing, or a negative error code if
+ *         the packet is malformed or no SA could be found or created
+ */
+static int esp_protocol_recv(struct sk_buff *skb){
+    const int eth_n = ETH_HLEN;
+    ESPHdr *esph = NULL;
+    SAState *sa = NULL;
+    u32 addr;
+    int ip_n;
+    int err = 0;
+
+    dprintf(">\n");
+    dprintf("> recv skb=\n"); skb_print_bits(skb, 0, skb->len);
+    ip_n = (skb->nh.iph->ihl << 2);
+    if(skb->data == skb->mac.raw){
+        // skb->data points at ethernet header.
+        if (!pskb_may_pull(skb, eth_n + ip_n)){
+            wprintf("> Malformed skb\n");
+            err = -EINVAL;
+            goto done;
+        }
+        skb_pull(skb, eth_n + ip_n);
+    }
+    addr = skb->nh.iph->daddr;
+    err = esp_skb_header(skb, &esph);
+    if(err) goto done;
+    dprintf("> spi=%08x protocol=%d addr=" IPFMT "\n",
+            esph->spi, IPPROTO_ESP, NIPQUAD(addr));
+    sa = sa_table_lookup_spi(esph->spi, IPPROTO_ESP, addr);
+    if(!sa){
+        err = vnet_sa_create(esph->spi, IPPROTO_ESP, addr, &sa);
+    }
+    if(!err){
+        err = SAState_recv(sa, skb);
+    }
+  done:
+    if(sa) SAState_decref(sa);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handle an ICMP error related to ESP.
+ * Only "destination unreachable / fragmentation needed" is acted on:
+ * the SA named by the embedded esp header is looked up and, if found,
+ * a warning is logged. Everything else is ignored.
+ *
+ * @param skb ICMP error packet
+ * @param info (unused)
+ */
+static void esp_protocol_icmp_err(struct sk_buff *skb, u32 info){
+    struct iphdr *iph = (struct iphdr*)skb->data;
+    ESPHdr *esph;
+    SAState *sa;
+    int frag_needed;
+
+    dprintf("> ICMP error type=%d code=%d\n",
+            skb->h.icmph->type, skb->h.icmph->code);
+    frag_needed = (skb->h.icmph->type == ICMP_DEST_UNREACH &&
+                   skb->h.icmph->code == ICMP_FRAG_NEEDED);
+    if(!frag_needed){
+        return;
+    }
+
+    //todo: need to check skb has enough len to do this.
+    esph = (ESPHdr*)(skb->data + (iph->ihl << 2));
+    sa = sa_table_lookup_spi(esph->spi, IPPROTO_ESP, iph->daddr);
+    if(sa){
+        wprintf("> ICMP unreachable on SA ESP spi=%08x addr=" IPFMT "\n",
+                ntohl(esph->spi), NIPQUAD(iph->daddr));
+        // Drop the reference taken by the lookup.
+        SAState_decref(sa);
+    }
+}
+
+//============================================================================
+// Registration of the ESP protocol handler. The inet protocol API
+// differs between 2.4 and 2.6 kernels, so there is one version of the
+// handler table and of the add/del wrappers for each.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+// Code for 2.6 kernel.
+
+/** Protocol handler for ESP.
+ */
+static struct net_protocol esp_protocol = {
+    .err_handler = esp_protocol_icmp_err,
+    .handler     = esp_protocol_recv,
+};
+
+/** Register the ESP protocol handler.
+ *
+ * @return result of inet_add_protocol()
+ */
+static int esp_protocol_add(void){
+    int err = inet_add_protocol(&esp_protocol, IPPROTO_ESP);
+    return err;
+}
+
+/** Deregister the ESP protocol handler.
+ *
+ * @return result of inet_del_protocol()
+ */
+static int esp_protocol_del(void){
+    int err = inet_del_protocol(&esp_protocol, IPPROTO_ESP);
+    return err;
+}
+
+//============================================================================
+#else
+//============================================================================
+// Code for 2.4 kernel.
+
+/** Protocol handler for ESP.
+ */
+static struct inet_protocol esp_protocol = {
+    .protocol    = IPPROTO_ESP,
+    .name        = "ESP",
+    .err_handler = esp_protocol_icmp_err,
+    .handler     = esp_protocol_recv,
+};
+
+/** Register the ESP protocol handler.
+ *
+ * @return always 0
+ */
+static int esp_protocol_add(void){
+    inet_add_protocol(&esp_protocol);
+    return 0;
+}
+
+/** Deregister the ESP protocol handler.
+ *
+ * @return result of inet_del_protocol()
+ */
+static int esp_protocol_del(void){
+    int err = inet_del_protocol(&esp_protocol);
+    return err;
+}
+
+#endif
+//============================================================================
+
+
+/** Initialize the ESP module.
+ * Registers the ESP SA type and then the ESP protocol. If the protocol
+ * cannot be registered the SA type is removed again, so a failed init
+ * leaves nothing registered.
+ *
+ * @return 0 on success, negative error code otherwise
+ */
+int __init esp_module_init(void){
+    int err = 0;
+    dprintf(">\n");
+    err = SAType_add(&esp_sa_type);
+    if(err < 0){
+        eprintf("> Error adding esp sa type\n");
+        goto exit;
+    }
+    err = esp_protocol_add();
+    if(err < 0){
+        eprintf("> Error adding esp protocol\n");
+        // Undo the SA type registration done above.
+        SAType_del(&esp_sa_type);
+    }
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Finalize the ESP module.
+ * Deregisters the ESP protocol, then the SA type. A failure of either
+ * step is logged and does not stop the other.
+ */
+void __exit esp_module_exit(void){
+    int err;
+
+    err = esp_protocol_del();
+    if(err < 0){
+        eprintf("> Error removing esp protocol\n");
+    }
+    err = SAType_del(&esp_sa_type);
+    if(err < 0){
+        eprintf("> Error removing esp sa type\n");
+    }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_ESP_H__
+#define __VNET_ESP_H__
+
+#include <linux/config.h>
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+/** Header used by IPSEC ESP (Encapsulated Security Payload).
+ * Its size (without the trailing data) is ESP_HDR_N.
+ */
+typedef struct ESPHdr {
+ /** The spi (security parameters index). */
+ u32 spi;
+ /** Sequence number. The sender writes it with htonl(). */
+ u32 seq;
+ /* Variable length data (depends on crypto suite).
+ Mind the 64 bit alignment!
+ When the cipher uses an iv, the iv is stored at the start of
+ this data. */
+ u8 data[0];
+} ESPHdr;
+
+/** Padding trailer used by IPSEC ESP.
+ * Follows the padding itself with the padding length and the
+ * protocol being encapsulated. Its size is ESP_PAD_N.
+ */
+typedef struct ESPPadding {
+ /** Number of padding bytes preceding this trailer. */
+ u8 pad_n;
+ /** Protocol number of the encapsulated payload. */
+ u8 protocol;
+} ESPPadding;
+
+/** Size of the esp header (spi and seq), in bytes.
+ * Does not include the iv, which follows the header. */
+static const int ESP_HDR_N = sizeof(ESPHdr);
+
+/** Size of the esp pad and next protocol field, in bytes. */
+static const int ESP_PAD_N = sizeof(ESPPadding);
+
+/* SA state codes.
+ * NOTE(review): the meaning of each value is only suggested by its
+ * name; confirm against the code that uses them. */
+enum {
+ SASTATE_VOID,
+ SASTATE_ACQUIRE,
+ SASTATE_VALID,
+ SASTATE_ERROR,
+ SASTATE_EXPIRED,
+ SASTATE_DEAD,
+};
+
+/* Forward declarations of the types this header only uses through
+ * pointers. Declaring struct sk_buff here keeps the header valid
+ * whatever was included before it. */
+struct sk_buff;
+struct ESPState;
+
+/** A cipher instance.
+ * The iv buffer and the transform are released when the esp state is
+ * finished; the key pointer is not.
+ */
+typedef struct ESPCipher {
+ /** Cipher key. */
+ u8 *key;
+ /** Key size (bytes). */
+ int key_n;
+ /** Initialization vector (IV) to use for the next packet sent. */
+ u8 *iv;
+ /** IV size (bytes). Zero if the cipher does not use an IV. */
+ int iv_n;
+ /** Block size for padding (bytes). When greater than zero the
+ * ciphertext size is also rounded up to a multiple of this. */
+ int pad_n;
+ /** Cipher block size (bytes). */
+ int block_n;
+ /** Cipher crypto transform. */
+ struct crypto_tfm *tfm;
+} ESPCipher;
+
+/** A digest instance.
+ * The working buffer and the transform are released when the esp
+ * state is finished; the key pointer is not.
+ */
+typedef struct ESPDigest {
+ /** Digest key. */
+ u8 *key;
+ /** Key size (bytes) */
+ int key_n;
+ /** ICV size used (bytes). This many bytes are carried in a packet. */
+ u8 icv_n;
+ /** Full ICV size when computed (bytes). */
+ u8 icv_full_n;
+ /** Working storage for computing ICV (icv_full_n bytes). */
+ u8 *icv_tmp;
+ /** Function used to compute ICV (e.g. HMAC).
+ * Digests len bytes of skb starting at offset and stores icv_n
+ * bytes of result in icv. */
+ void (*icv)(struct ESPState *esp,
+ struct sk_buff *skb,
+ int offset,
+ int len,
+ u8 *icv);
+ /** Digest crypto transform (e.g. SHA). */
+ struct crypto_tfm *tfm;
+} ESPDigest;
+
+/** Per-SA ESP state: the cipher and digest used by the SA. */
+typedef struct ESPState {
+ /** Cipher used for encryption and decryption. */
+ struct ESPCipher cipher;
+ /** Digest used to compute and check the ICV. */
+ struct ESPDigest digest;
+} ESPState;
+
+/** Register the ESP SA type and protocol. Returns 0 on success,
+ * negative error code otherwise. */
+extern int esp_module_init(void);
+/** Deregister the ESP protocol and SA type. */
+extern void esp_module_exit(void);
+
+#endif /* !__VNET_ESP_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+
+#include <linux/version.h>
+
+#include <linux/skbuff.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/icmp.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <net/checksum.h>
+
+#include <etherip.h>
+#include <tunnel.h>
+#include <vnet.h>
+#include <varp.h>
+#include <if_varp.h>
+#include <skb_util.h>
+
+#define MODULE_NAME "VNET"
+//#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** @file Etherip implementation.
+ * The etherip protocol is used to transport Ethernet frames in IP packets.
+ */
+
+/* MAC_ETH(skb): the skb's mac header as a struct ethhdr pointer.
+ * 2.6 kernels only provide the raw pointer, so it is cast there; the
+ * 2.4 branch uses the skb's own mac.ethernet member. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define MAC_ETH(_skb) ((struct ethhdr *)(_skb)->mac.raw)
+#else
+#define MAC_ETH(_skb) ((_skb)->mac.ethernet)
+#endif
+
+/** Read the vnet label from an etherip header.
+ * With CONFIG_ETHERIP_EXT the label is the header's vnet field,
+ * converted from network order; otherwise it is the reserved field.
+ *
+ * @param hdr header
+ * @return vnet (in host order)
+ */
+int etheriphdr_get_vnet(struct etheriphdr *hdr){
+    int vnet;
+
+#ifdef CONFIG_ETHERIP_EXT
+    vnet = ntohl(hdr->vnet);
+#else
+    vnet = hdr->reserved;
+#endif
+    return vnet;
+}
+
+/** Store a vnet label in an etherip header.
+ * Also sets the etherip version: 4 with CONFIG_ETHERIP_EXT, where the
+ * label goes in the vnet field in network order, and 3 otherwise,
+ * where only the low 12 bits of the label are kept in the reserved
+ * field.
+ *
+ * @param hdr header
+ * @param vnet vnet label (in host order)
+ */
+void etheriphdr_set_vnet(struct etheriphdr *hdr, int vnet){
+#ifdef CONFIG_ETHERIP_EXT
+    hdr->vnet = htonl(vnet);
+    hdr->version = 4;
+#else
+    hdr->reserved = vnet & 0x0fff;
+    hdr->version = 3;
+#endif
+}
+
+/** Open an etherip tunnel.
+ * Nothing needs to be set up, so this always succeeds.
+ *
+ * @param tunnel to open (unused)
+ * @return 0 on success, error code otherwise
+ */
+static int etherip_tunnel_open(Tunnel *tunnel){
+ return 0;
+}
+
+/** Close an etherip tunnel.
+ * Does nothing.
+ *
+ * @param tunnel to close (unused)
+ */
+static void etherip_tunnel_close(Tunnel *tunnel){
+}
+
+
+/** Send a packet via an etherip tunnel.
+ * Makes headroom for an etherip header, an ip header and an ethernet
+ * header, pushes and fills in the etherip and ip headers in front of
+ * the ethernet frame, and passes the packet to the base tunnel.
+ * The skb is freed here if making room fails.
+ *
+ * @param tunnel tunnel
+ * @param skb packet
+ * @return 0 on success, error code otherwise
+ */
+static int etherip_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){
+    const int etherip_n = sizeof(struct etheriphdr);
+    const int ip_n = sizeof(struct iphdr);
+    const int eth_n = ETH_HLEN;
+    const int head_n = etherip_n + ip_n + eth_n;
+    // The source address is left as 0: looking up the device address
+    // is not done here.
+    const u32 saddr = 0;
+    int vnet = tunnel->key.vnet;
+    struct etheriphdr *etheriph;
+    struct iphdr *iph;
+    int err;
+
+    dprintf("> skb=%p vnet=%d\n", skb, vnet);
+    err = skb_make_room(&skb, skb, head_n, 0);
+    if(err) goto exit;
+
+    // Null the mac pointer as we are pushing a new IP header.
+    skb->mac.raw = NULL;
+
+    // Setup the etherip header.
+    etheriph = (struct etheriphdr *)skb_push(skb, etherip_n);
+    etheriphdr_set_vnet(etheriph, vnet);
+
+    // Setup the IP header.
+    skb->nh.raw = skb_push(skb, ip_n);
+    iph = skb->nh.iph;
+    iph->version  = 4;                  // Standard version.
+    iph->ihl      = ip_n / 4;           // IP header length (32-bit words).
+    iph->tos      = 0;                  // No special type-of-service.
+    iph->tot_len  = htons(skb->len);    // Total packet length (bytes).
+    iph->id       = 0;                  // No flow id (since no frags).
+    iph->frag_off = htons(IP_DF);       // Don't fragment - can't handle frags.
+    iph->ttl      = 64;                 // Linux default time-to-live.
+    iph->protocol = IPPROTO_ETHERIP;    // IP protocol number.
+    iph->saddr    = saddr;              // Source address.
+    iph->daddr    = tunnel->key.addr;   // Destination address.
+    iph->check    = 0;
+
+    // Ethernet header will be filled-in by device.
+    err = Tunnel_send(tunnel->base, skb);
+    // The skb has been handed on; do not free it below.
+    skb = NULL;
+  exit:
+    if(err && skb) dev_kfree_skb(skb);
+    return err;
+}
+
+/** Tunnel type for etherip.
+ * Private descriptor; other files reach it through the
+ * etherip_tunnel_type pointer below.
+ */
+static TunnelType _etherip_tunnel_type = {
+ .name = "ETHERIP",
+ .open = etherip_tunnel_open,
+ .close = etherip_tunnel_close,
+ .send = etherip_tunnel_send
+};
+
+/** Exported pointer to the etherip tunnel type. */
+TunnelType *etherip_tunnel_type = &_etherip_tunnel_type;
+
+/* Defeat compiler warnings about unused functions. */
+static void print_str(char *s, int n) __attribute__((unused));
+
+/** Debug helper: print n bytes of a buffer with printk.
+ * ASCII letters and digits are printed as themselves, every other byte
+ * as <hex>. A newline is printed before every 40th byte and at the end.
+ *
+ * @param s buffer
+ * @param n number of bytes to print
+ */
+static void print_str(char *s, int n) {
+    int i;
+
+    for(i = 0; i < n; i++){
+        char c = s[i];
+        int alnum = ('a' <= c && c <= 'z') ||
+                    ('A' <= c && c <= 'Z') ||
+                    ('0' <= c && c <= '9');
+
+        if(i && i % 40 == 0) printk("\n");
+        if(alnum){
+            printk("%c", c);
+        } else {
+            printk("<%x>", (unsigned)(0xff & c));
+        }
+    }
+    printk("\n");
+}
+
+/** Do etherip receive processing.
+ * Strips etherip header to extract the ethernet frame, sets
+ * the vnet from the header and re-receives the frame.
+ *
+ * Multicast packets not sent to the varp multicast address are ignored.
+ *
+ * @param skb packet
+ * @return 1 once the packet has passed the vnet security check
+ *         (whatever the result of re-receiving it), 0 if it was
+ *         ignored, error code if it was malformed or failed the check
+ */
+static int etherip_protocol_recv(struct sk_buff *skb){
+    int err = 0;
+    int mine = 0;
+    const int eth_n = ETH_HLEN;
+    int ip_n;
+    const int etherip_n = sizeof(struct etheriphdr);
+    struct etheriphdr *etheriph;
+    Vnet *vinfo = NULL;
+    u32 vnet;
+
+    if(MULTICAST(skb->nh.iph->daddr) &&
+       (skb->nh.iph->daddr != varp_mcast_addr)){
+        // Ignore multicast packets not addressed to us.
+        dprintf("> dst=%u.%u.%u.%u varp_mcast_addr=%u.%u.%u.%u\n",
+                NIPQUAD(skb->nh.iph->daddr),
+                NIPQUAD(varp_mcast_addr));
+        goto exit;
+    }
+    ip_n = (skb->nh.iph->ihl << 2);
+    if(skb->data == skb->mac.raw){
+        // skb->data points at ethernet header.
+        if (!pskb_may_pull(skb, eth_n + ip_n)){
+            wprintf("> Malformed skb\n");
+            err = -EINVAL;
+            goto exit;
+        }
+        skb_pull(skb, eth_n + ip_n);
+    }
+    // Assume skb->data points at etherip header.
+    if(!pskb_may_pull(skb, etherip_n)){
+        wprintf("> Malformed skb\n");
+        err = -EINVAL;
+        goto exit;
+    }
+    // Take the header pointer only after pskb_may_pull(), which may
+    // have moved the packet data.
+    etheriph = (void*)skb->data;
+    vnet = etheriphdr_get_vnet(etheriph);
+    dprintf("> Rcvd skb=%p vnet=%d\n", skb, vnet);
+    // If vnet is secure, context must include IPSEC ESP.
+    err = vnet_check_context(vnet, SKB_CONTEXT(skb), &vinfo);
+    Vnet_decref(vinfo);
+    if(err){
+        wprintf("> Failed security check\n");
+        goto exit;
+    }
+    // From here on the packet is claimed and 1 is returned.
+    mine = 1;
+    // Point at the headers in the contained ethernet frame.
+    skb->mac.raw = skb_pull(skb, etherip_n);
+
+    // Know source ip, vnet, vmac, so could update varp cache.
+    // But if traffic comes to us over a vnetd tunnel this points the coa
+    // at the vnetd rather than the endpoint. So don't do it.
+    //varp_update(htonl(vnet), MAC_ETH(skb)->h_source, skb->nh.iph->saddr);
+
+    // Assuming a standard Ethernet frame.
+    skb->nh.raw = skb_pull(skb, ETH_HLEN);
+
+#ifdef CONFIG_NETFILTER
+#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
+    // This stops our new pkt header being clobbered by a subsequent
+    // call to nf_bridge_maybe_copy_header. Just replicate the
+    // corresponding nf_bridge_save_header.
+    if(skb->nf_bridge){
+        int header_size = 16;
+        if(MAC_ETH(skb)->h_proto == __constant_htons(ETH_P_8021Q)) {
+            header_size = 18;
+        }
+        memcpy(skb->nf_bridge->data, skb->data - header_size, header_size);
+    }
+#endif
+#endif
+
+    {
+        struct ethhdr *eth = MAC_ETH(skb);
+        // Devices use eth_type_trans() to set skb->pkt_type and skb->protocol.
+        // Set them from contained ethhdr, or leave as received?
+        // 'Ware use of hard_header_len in eth_type_trans().
+
+        if(ntohs(eth->h_proto) >= 1536){
+            skb->protocol = eth->h_proto;
+        } else {
+            skb->protocol = htons(ETH_P_802_2);
+        }
+
+        if(mac_is_multicast(eth->h_dest)){
+            if(mac_is_broadcast(eth->h_dest)){
+                skb->pkt_type = PACKET_BROADCAST;
+            } else {
+                skb->pkt_type = PACKET_MULTICAST;
+            }
+        } else {
+            skb->pkt_type = PACKET_HOST;
+        }
+
+        // Drop state belonging to the outer packet.
+        memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+        if (skb->ip_summed == CHECKSUM_HW){
+            skb->ip_summed = CHECKSUM_NONE;
+        }
+        dst_release(skb->dst);
+        skb->dst = NULL;
+#ifdef CONFIG_NETFILTER
+        nf_conntrack_put(skb->nfct);
+        skb->nfct = NULL;
+#ifdef CONFIG_NETFILTER_DEBUG
+        skb->nf_debug = 0;
+#endif
+#endif
+    }
+
+    err = vnet_skb_recv(skb, vnet, (Vmac*)MAC_ETH(skb)->h_dest);
+  exit:
+    if(mine) err = 1;
+    dprintf("< skb=%p err=%d\n", skb, err);
+    return err;
+}
+
+/** Handle an ICMP error related to etherip.
+ *
+ * @param skb ICMP error packet
+ * @param info
+ */
+static void etherip_protocol_icmp_err(struct sk_buff *skb, u32 info){
+ struct iphdr *iph = (struct iphdr*)skb->data;
+
+ wprintf("> ICMP error type=%d code=%d addr=" IPFMT "\n",
+ skb->h.icmph->type, skb->h.icmph->code, NIPQUAD(iph->daddr));
+
+ if (skb->h.icmph->type != ICMP_DEST_UNREACH ||
+ skb->h.icmph->code != ICMP_FRAG_NEEDED){
+ return;
+ }
+ wprintf("> MTU too big addr= " IPFMT "\n", NIPQUAD(iph->daddr));
+}
+
+//============================================================================
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+// Code for 2.6 kernel.
+
+/** Etherip protocol.
+ * Protocol descriptor for 2.6 kernels: packets are delivered to
+ * etherip_protocol_recv and ICMP errors to etherip_protocol_icmp_err.
+ * The protocol number is supplied separately at registration time.
+ */
+static struct net_protocol etherip_protocol = {
+ .handler = etherip_protocol_recv,
+ .err_handler = etherip_protocol_icmp_err,
+};
+
+/** Register the etherip protocol descriptor (2.6 kernels).
+ *
+ * @return whatever inet_add_protocol() returns
+ */
+static int etherip_protocol_add(void){
+    return inet_add_protocol(&etherip_protocol, IPPROTO_ETHERIP);
+}
+
+/** Deregister the etherip protocol descriptor (2.6 kernels).
+ *
+ * @return whatever inet_del_protocol() returns
+ */
+static int etherip_protocol_del(void){
+    return inet_del_protocol(&etherip_protocol, IPPROTO_ETHERIP);
+}
+
+//============================================================================
+#else
+//============================================================================
+// Code for 2.4 kernel.
+
+/** Etherip protocol.
+ * Protocol descriptor for 2.4 kernels: unlike the 2.6 variant it carries
+ * its own name and protocol number in addition to the receive and
+ * ICMP-error handlers.
+ */
+static struct inet_protocol etherip_protocol = {
+ .name = "ETHERIP",
+ .protocol = IPPROTO_ETHERIP,
+ .handler = etherip_protocol_recv,
+ .err_handler = etherip_protocol_icmp_err,
+};
+
+/** Register the etherip protocol descriptor (2.4 kernels).
+ * No result of inet_add_protocol() is used here, so this always
+ * reports success.
+ *
+ * @return 0
+ */
+static int etherip_protocol_add(void){
+    inet_add_protocol(&etherip_protocol);
+    return 0;
+}
+
+/** Deregister the etherip protocol descriptor (2.4 kernels).
+ *
+ * @return whatever inet_del_protocol() returns
+ */
+static int etherip_protocol_del(void){
+    return inet_del_protocol(&etherip_protocol);
+}
+
+#endif
+//============================================================================
+
+
+/** Initialize the etherip module.
+ * Registers the etherip protocol and reports a registration failure to
+ * the caller instead of silently discarding it.
+ *
+ * @return 0 on success, the (negative) result of etherip_protocol_add()
+ *         otherwise
+ */
+int __init etherip_module_init(void) {
+    int err = etherip_protocol_add();
+    if(err < 0){
+        printk(KERN_INFO "%s: can't add etherip protocol\n", __FUNCTION__);
+    }
+    return err;
+}
+
+/** Finalize the etherip module.
+ * Deregisters the etherip protocol; a negative result from
+ * etherip_protocol_del() is logged and otherwise ignored.
+ */
+void __exit etherip_module_exit(void) {
+    int err = etherip_protocol_del();
+    if(err >= 0) return;
+    printk(KERN_INFO "%s: can't remove etherip protocol\n", __FUNCTION__);
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_ETHERIP_H_
+#define _VNET_ETHERIP_H_
+
+#include "if_etherip.h"
+
+/* Register the etherip protocol handler; returns an int status. */
+extern int etherip_module_init(void);
+/* Deregister the etherip protocol handler. */
+extern void etherip_module_exit(void);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_IF_ETHERIP_H_
+#define _VNET_IF_ETHERIP_H_
+/*----------------------------------------------------------------------------*/
+#ifdef CONFIG_ETHERIP_EXT
+/* Extended etherip header, selected by CONFIG_ETHERIP_EXT:
+ * a 1-byte version followed by a 4-byte vnet id, packed (5 bytes, no padding).
+ * NOTE(review): byte order of vnet on the wire is not visible here - confirm.
+ */
+struct etheriphdr {
+ __u8 version;
+ __u32 vnet;
+} __attribute__ ((packed));
+
+/*----------------------------------------------------------------------------*/
+#else
+/* Default etherip header: one 16-bit word split into a 4-bit version and
+ * 12 reserved bits. The two bitfield orderings below are selected by the
+ * bitfield endianness macros; the build fails if neither macro is defined.
+ * This looks like the RFC 3378 header layout - TODO confirm.
+ */
+struct etheriphdr
+{
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u16 reserved:12,
+ version:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u16 version:4,
+ reserved:12;
+#else
+#error "Please fix <asm/byteorder.h>"
+#endif
+
+};
+#endif
+
+#ifndef IPPROTO_ETHERIP
+#define IPPROTO_ETHERIP 97
+#endif
+
+/*----------------------------------------------------------------------------*/
+
+#endif /* ! _VNET_IF_ETHERIP_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _VNET_IF_VARP_H
+#define _VNET_IF_VARP_H
+
+/* A MAC address wrapped in a struct so it can be passed and copied by value.
+ * ETH_ALEN is not defined in this header; the includer must provide it
+ * (presumably via <linux/if_ether.h> - TODO confirm).
+ */
+typedef struct Vmac {
+ unsigned char mac[ETH_ALEN];
+} Vmac;
+
+/* Varp constants. VARP_ID and VARP_OP_REQUEST share the value 1; they
+ * presumably go in different fields (id vs. opcode of VnetMsgHdr) - verify
+ * against the users of these constants.
+ */
+enum {
+ VARP_ID = 1,
+ VARP_OP_REQUEST = 1,
+ VARP_OP_ANNOUNCE = 2,
+};
+
+/* Common message header: 16-bit id followed by 16-bit opcode, packed
+ * (4 bytes).
+ */
+typedef struct VnetMsgHdr {
+ uint16_t id;
+ uint16_t opcode;
+} __attribute__((packed)) VnetMsgHdr;
+
+/* Varp message: message header, then 32-bit vnet id, a Vmac and a 32-bit
+ * address, packed.
+ * NOTE(review): the bare `VnetMsgHdr;` member is an unnamed member declared
+ * through a typedef name. That is a compiler extension (gcc needs
+ * -fms-extensions for it); without it the line may declare nothing and the
+ * header fields would be missing from the layout - confirm the build flags.
+ */
+typedef struct VarpHdr {
+ VnetMsgHdr;
+ uint32_t vnet;
+ Vmac vmac;
+ uint32_t addr;
+} __attribute__((packed)) VarpHdr;
+
+/** Default address for varp/vnet broadcasts: 224.10.0.1 */
+#define VARP_MCAST_ADDR 0xe00a0001
+
+/** UDP port to use for varp protocol. */
+#define VARP_PORT 1798
+
+
+
+#endif /* ! _VNET_IF_VARP_H */
--- /dev/null
+/* PF_KEY user interface, this is defined by rfc2367 so
+ * do not make arbitrary modifications or else this header
+ * file will not be compliant.
+ */
+
+#ifndef _LINUX_PFKEY2_H
+#define _LINUX_PFKEY2_H
+
+#include <linux/types.h>
+
+#define PF_KEY_V2 2
+#define PFKEYV2_REVISION 199806L
+
+/* Message header: four 1-byte fields (version, type, errno, satype),
+ * two 2-byte fields (len, reserved) and two 4-byte fields (seq, pid),
+ * packed. Field semantics are defined by RFC 2367.
+ */
+struct sadb_msg {
+ uint8_t sadb_msg_version;
+ uint8_t sadb_msg_type;
+ uint8_t sadb_msg_errno;
+ uint8_t sadb_msg_satype;
+ uint16_t sadb_msg_len;
+ uint16_t sadb_msg_reserved;
+ uint32_t sadb_msg_seq;
+ uint32_t sadb_msg_pid;
+} __attribute__((packed));
+/* sizeof(struct sadb_msg) == 16 */
+
+/* Generic 4-byte extension prefix: 16-bit length then 16-bit type. The
+ * other structures below begin with the same len/type pair.
+ */
+struct sadb_ext {
+ uint16_t sadb_ext_len;
+ uint16_t sadb_ext_type;
+} __attribute__((packed));
+/* sizeof(struct sadb_ext) == 4 */
+
+/* Security association extension: len/exttype prefix, 32-bit spi, four
+ * 1-byte fields (replay, state, auth, encrypt) and 32-bit flags, packed.
+ * Values for state/auth/encrypt/flags presumably come from the
+ * SADB_SASTATE_*, SADB_*AALG_*, SADB_*EALG_* and SADB_SAFLAGS_* constants
+ * later in this header - see RFC 2367.
+ */
+struct sadb_sa {
+ uint16_t sadb_sa_len;
+ uint16_t sadb_sa_exttype;
+ uint32_t sadb_sa_spi;
+ uint8_t sadb_sa_replay;
+ uint8_t sadb_sa_state;
+ uint8_t sadb_sa_auth;
+ uint8_t sadb_sa_encrypt;
+ uint32_t sadb_sa_flags;
+} __attribute__((packed));
+/* sizeof(struct sadb_sa) == 16 */
+
+/* Lifetime extension: len/exttype prefix, a 32-bit allocation count and
+ * three 64-bit values (bytes, addtime, usetime), packed.
+ */
+struct sadb_lifetime {
+ uint16_t sadb_lifetime_len;
+ uint16_t sadb_lifetime_exttype;
+ uint32_t sadb_lifetime_allocations;
+ uint64_t sadb_lifetime_bytes;
+ uint64_t sadb_lifetime_addtime;
+ uint64_t sadb_lifetime_usetime;
+} __attribute__((packed));
+/* sizeof(struct sadb_lifetime) == 32 */
+
+/* Address extension header: len/exttype prefix, 1-byte proto, 1-byte
+ * prefixlen and a 16-bit reserved field, packed. The address itself is not
+ * part of this struct; presumably it follows the header - see RFC 2367.
+ */
+struct sadb_address {
+ uint16_t sadb_address_len;
+ uint16_t sadb_address_exttype;
+ uint8_t sadb_address_proto;
+ uint8_t sadb_address_prefixlen;
+ uint16_t sadb_address_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_address) == 8 */
+
+/* Key extension header: len/exttype prefix, 16-bit key size in bits and a
+ * 16-bit reserved field, packed. The key bytes are not part of this struct;
+ * presumably they follow the header - see RFC 2367.
+ */
+struct sadb_key {
+ uint16_t sadb_key_len;
+ uint16_t sadb_key_exttype;
+ uint16_t sadb_key_bits;
+ uint16_t sadb_key_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_key) == 8 */
+
+/* Identity extension: len/exttype prefix, 16-bit type, 16-bit reserved and
+ * a 64-bit id, packed. Type values presumably come from the
+ * SADB_IDENTTYPE_* constants later in this header.
+ */
+struct sadb_ident {
+ uint16_t sadb_ident_len;
+ uint16_t sadb_ident_exttype;
+ uint16_t sadb_ident_type;
+ uint16_t sadb_ident_reserved;
+ uint64_t sadb_ident_id;
+} __attribute__((packed));
+/* sizeof(struct sadb_ident) == 16 */
+
+/* Sensitivity extension header: len/exttype prefix, 32-bit dpd, 1-byte
+ * level and length for each of "sens" and "integ", and a 32-bit reserved
+ * field, packed. The bitmaps described in the comment after this struct
+ * are not part of it.
+ */
+struct sadb_sens {
+ uint16_t sadb_sens_len;
+ uint16_t sadb_sens_exttype;
+ uint32_t sadb_sens_dpd;
+ uint8_t sadb_sens_sens_level;
+ uint8_t sadb_sens_sens_len;
+ uint8_t sadb_sens_integ_level;
+ uint8_t sadb_sens_integ_len;
+ uint32_t sadb_sens_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_sens) == 16 */
+
+/* followed by:
+ uint64_t sadb_sens_bitmap[sens_len];
+ uint64_t sadb_integ_bitmap[integ_len]; */
+
+/* Proposal extension header: len/exttype prefix, 1-byte replay and three
+ * reserved bytes, packed. The sadb_comb entries described in the comment
+ * after this struct are not part of it.
+ */
+struct sadb_prop {
+ uint16_t sadb_prop_len;
+ uint16_t sadb_prop_exttype;
+ uint8_t sadb_prop_replay;
+ uint8_t sadb_prop_reserved[3];
+} __attribute__((packed));
+/* sizeof(struct sadb_prop) == 8 */
+
+/* followed by:
+ struct sadb_comb sadb_combs[(sadb_prop_len +
+ sizeof(uint64_t) - sizeof(struct sadb_prop)) /
+ sizeof(struct sadb_comb)]; */
+
+/* One algorithm combination: auth and encrypt algorithm ids (1 byte each),
+ * 16-bit flags, min/max key bits for auth and encrypt (16 bits each), a
+ * 32-bit reserved field, soft/hard allocation limits (32 bits each) and
+ * soft/hard limits for bytes, addtime and usetime (64 bits each), packed.
+ * Unlike the extension structs, it has no len/exttype prefix.
+ */
+struct sadb_comb {
+ uint8_t sadb_comb_auth;
+ uint8_t sadb_comb_encrypt;
+ uint16_t sadb_comb_flags;
+ uint16_t sadb_comb_auth_minbits;
+ uint16_t sadb_comb_auth_maxbits;
+ uint16_t sadb_comb_encrypt_minbits;
+ uint16_t sadb_comb_encrypt_maxbits;
+ uint32_t sadb_comb_reserved;
+ uint32_t sadb_comb_soft_allocations;
+ uint32_t sadb_comb_hard_allocations;
+ uint64_t sadb_comb_soft_bytes;
+ uint64_t sadb_comb_hard_bytes;
+ uint64_t sadb_comb_soft_addtime;
+ uint64_t sadb_comb_hard_addtime;
+ uint64_t sadb_comb_soft_usetime;
+ uint64_t sadb_comb_hard_usetime;
+} __attribute__((packed));
+/* sizeof(struct sadb_comb) == 72 */
+
+/* Supported-algorithms extension header: len/exttype prefix and a 32-bit
+ * reserved field, packed. The sadb_alg entries described in the comment
+ * after this struct are not part of it.
+ */
+struct sadb_supported {
+ uint16_t sadb_supported_len;
+ uint16_t sadb_supported_exttype;
+ uint32_t sadb_supported_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_supported) == 8 */
+
+/* followed by:
+ struct sadb_alg sadb_algs[(sadb_supported_len +
+ sizeof(uint64_t) - sizeof(struct sadb_supported)) /
+ sizeof(struct sadb_alg)]; */
+
+/* One algorithm description: 1-byte id, 1-byte IV length, 16-bit minimum
+ * and maximum key bits and a 16-bit reserved field, packed. It has no
+ * len/exttype prefix.
+ */
+struct sadb_alg {
+ uint8_t sadb_alg_id;
+ uint8_t sadb_alg_ivlen;
+ uint16_t sadb_alg_minbits;
+ uint16_t sadb_alg_maxbits;
+ uint16_t sadb_alg_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_alg) == 8 */
+
+/* SPI range extension: len/exttype prefix, 32-bit min, 32-bit max and a
+ * 32-bit reserved field, packed.
+ */
+struct sadb_spirange {
+ uint16_t sadb_spirange_len;
+ uint16_t sadb_spirange_exttype;
+ uint32_t sadb_spirange_min;
+ uint32_t sadb_spirange_max;
+ uint32_t sadb_spirange_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_spirange) == 16 */
+
+/* sadb_x_kmprivate extension: len/exttype prefix and a 32-bit reserved
+ * field, packed. Note the reserved field is spelled u_int32_t here, unlike
+ * the uint32_t used elsewhere in this header.
+ */
+struct sadb_x_kmprivate {
+ uint16_t sadb_x_kmprivate_len;
+ uint16_t sadb_x_kmprivate_exttype;
+ u_int32_t sadb_x_kmprivate_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_kmprivate) == 8 */
+
+/* sadb_x_sa2 extension: len/exttype prefix, 1-byte mode, 1-byte and 16-bit
+ * reserved fields, 32-bit sequence and 32-bit reqid, packed.
+ */
+struct sadb_x_sa2 {
+ uint16_t sadb_x_sa2_len;
+ uint16_t sadb_x_sa2_exttype;
+ uint8_t sadb_x_sa2_mode;
+ uint8_t sadb_x_sa2_reserved1;
+ uint16_t sadb_x_sa2_reserved2;
+ uint32_t sadb_x_sa2_sequence;
+ uint32_t sadb_x_sa2_reqid;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_sa2) == 16 */
+
+/* sadb_x_policy extension: len/exttype prefix, 16-bit type, 1-byte
+ * direction, 1-byte reserved, 32-bit id and a second 32-bit reserved field,
+ * packed.
+ */
+struct sadb_x_policy {
+ uint16_t sadb_x_policy_len;
+ uint16_t sadb_x_policy_exttype;
+ uint16_t sadb_x_policy_type;
+ uint8_t sadb_x_policy_dir;
+ uint8_t sadb_x_policy_reserved;
+ uint32_t sadb_x_policy_id;
+ uint32_t sadb_x_policy_reserved2;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_policy) == 16 */
+
+/* sadb_x_ipsecrequest: 16-bit len, 16-bit proto, 1-byte mode, 1-byte level
+ * and 16-bit reqid, packed. It starts with len/proto rather than the
+ * len/exttype pair used by the extension structs.
+ * NOTE(review): the fields declared here add up to 8 bytes.
+ */
+struct sadb_x_ipsecrequest {
+ uint16_t sadb_x_ipsecrequest_len;
+ uint16_t sadb_x_ipsecrequest_proto;
+ uint8_t sadb_x_ipsecrequest_mode;
+ uint8_t sadb_x_ipsecrequest_level;
+ uint16_t sadb_x_ipsecrequest_reqid;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_ipsecrequest) == 8 as declared here (2+2+1+1+2, packed) */
+
+/* This defines the TYPE of Nat Traversal in use. Currently only one
+ * type of NAT-T is supported, draft-ietf-ipsec-udp-encaps-06
+ */
+/* len/exttype prefix, 1-byte NAT-T type and three reserved bytes, packed. */
+struct sadb_x_nat_t_type {
+ uint16_t sadb_x_nat_t_type_len;
+ uint16_t sadb_x_nat_t_type_exttype;
+ uint8_t sadb_x_nat_t_type_type;
+ uint8_t sadb_x_nat_t_type_reserved[3];
+} __attribute__((packed));
+/* sizeof(struct sadb_x_nat_t_type) == 8 */
+
+/* Pass a NAT Traversal port (Source or Dest port) */
+/* len/exttype prefix, 16-bit port and 16-bit reserved field, packed.
+ * NOTE(review): byte order of the port is not visible here - confirm.
+ */
+struct sadb_x_nat_t_port {
+ uint16_t sadb_x_nat_t_port_len;
+ uint16_t sadb_x_nat_t_port_exttype;
+ uint16_t sadb_x_nat_t_port_port;
+ uint16_t sadb_x_nat_t_port_reserved;
+} __attribute__((packed));
+/* sizeof(struct sadb_x_nat_t_port) == 8 */
+
+/* Message types */
+#define SADB_RESERVED 0
+#define SADB_GETSPI 1
+#define SADB_UPDATE 2
+#define SADB_ADD 3
+#define SADB_DELETE 4
+#define SADB_GET 5
+#define SADB_ACQUIRE 6
+#define SADB_REGISTER 7
+#define SADB_EXPIRE 8
+#define SADB_FLUSH 9
+#define SADB_DUMP 10
+#define SADB_X_PROMISC 11
+#define SADB_X_PCHANGE 12
+#define SADB_X_SPDUPDATE 13
+#define SADB_X_SPDADD 14
+#define SADB_X_SPDDELETE 15
+#define SADB_X_SPDGET 16
+#define SADB_X_SPDACQUIRE 17
+#define SADB_X_SPDDUMP 18
+#define SADB_X_SPDFLUSH 19
+#define SADB_X_SPDSETIDX 20
+#define SADB_X_SPDEXPIRE 21
+#define SADB_X_SPDDELETE2 22
+#define SADB_X_NAT_T_NEW_MAPPING 23
+#define SADB_MAX 23
+
+/* Security Association flags */
+#define SADB_SAFLAGS_PFS 1
+
+/* Security Association states */
+#define SADB_SASTATE_LARVAL 0
+#define SADB_SASTATE_MATURE 1
+#define SADB_SASTATE_DYING 2
+#define SADB_SASTATE_DEAD 3
+#define SADB_SASTATE_MAX 3
+
+/* Security Association types */
+#define SADB_SATYPE_UNSPEC 0
+#define SADB_SATYPE_AH 2
+#define SADB_SATYPE_ESP 3
+#define SADB_SATYPE_RSVP 5
+#define SADB_SATYPE_OSPFV2 6
+#define SADB_SATYPE_RIPV2 7
+#define SADB_SATYPE_MIP 8
+#define SADB_X_SATYPE_IPCOMP 9
+#define SADB_SATYPE_MAX 9
+
+/* Authentication algorithms */
+#define SADB_AALG_NONE 0
+#define SADB_AALG_MD5HMAC 2
+#define SADB_AALG_SHA1HMAC 3
+#define SADB_X_AALG_SHA2_256HMAC 5
+#define SADB_X_AALG_SHA2_384HMAC 6
+#define SADB_X_AALG_SHA2_512HMAC 7
+#define SADB_X_AALG_RIPEMD160HMAC 8
+#define SADB_X_AALG_NULL 251 /* kame */
+#define SADB_AALG_MAX 251
+
+/* Encryption algorithms */
+#define SADB_EALG_NONE 0
+#define SADB_EALG_DESCBC 2
+#define SADB_EALG_3DESCBC 3
+#define SADB_X_EALG_CASTCBC 6
+#define SADB_X_EALG_BLOWFISHCBC 7
+#define SADB_EALG_NULL 11
+#define SADB_X_EALG_AESCBC 12
+#define SADB_EALG_MAX 12
+
+/* Compression algorithms */
+#define SADB_X_CALG_NONE 0
+#define SADB_X_CALG_OUI 1
+#define SADB_X_CALG_DEFLATE 2
+#define SADB_X_CALG_LZS 3
+#define SADB_X_CALG_LZJH 4
+#define SADB_X_CALG_MAX 4
+
+/* Extension Header values */
+#define SADB_EXT_RESERVED 0
+#define SADB_EXT_SA 1
+#define SADB_EXT_LIFETIME_CURRENT 2
+#define SADB_EXT_LIFETIME_HARD 3
+#define SADB_EXT_LIFETIME_SOFT 4
+#define SADB_EXT_ADDRESS_SRC 5
+#define SADB_EXT_ADDRESS_DST 6
+#define SADB_EXT_ADDRESS_PROXY 7
+#define SADB_EXT_KEY_AUTH 8
+#define SADB_EXT_KEY_ENCRYPT 9
+#define SADB_EXT_IDENTITY_SRC 10
+#define SADB_EXT_IDENTITY_DST 11
+#define SADB_EXT_SENSITIVITY 12
+#define SADB_EXT_PROPOSAL 13
+#define SADB_EXT_SUPPORTED_AUTH 14
+#define SADB_EXT_SUPPORTED_ENCRYPT 15
+#define SADB_EXT_SPIRANGE 16
+#define SADB_X_EXT_KMPRIVATE 17
+#define SADB_X_EXT_POLICY 18
+#define SADB_X_EXT_SA2 19
+/* The next four entries are for setting up NAT Traversal */
+#define SADB_X_EXT_NAT_T_TYPE 20
+#define SADB_X_EXT_NAT_T_SPORT 21
+#define SADB_X_EXT_NAT_T_DPORT 22
+#define SADB_X_EXT_NAT_T_OA 23
+#define SADB_EXT_MAX 23
+
+/* Identity Extension values */
+#define SADB_IDENTTYPE_RESERVED 0
+#define SADB_IDENTTYPE_PREFIX 1
+#define SADB_IDENTTYPE_FQDN 2
+#define SADB_IDENTTYPE_USERFQDN 3
+#define SADB_IDENTTYPE_MAX 3
+
+#endif /* !(_LINUX_PFKEY2_H) */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+
+#include "hash_table.h"
+
+#define MODULE_NAME "RANDOM"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** @file
+ * Source of randomness.
+ * The current implementation is not adequate:
+ * it needs to be cryptographically strong.
+ */
+
+static unsigned long seed = 0;
+static unsigned long count = 0;
+
+/** Mix a value into a state word using pseudo_des().
+ *
+ * @param a state word, passed to pseudo_des() as its first argument
+ * @param b value to mix in
+ * @return the second word as left by pseudo_des()
+ */
+static unsigned long stir(unsigned long *a, unsigned long b){
+    unsigned long mixed = b;
+    pseudo_des(a, &mixed);
+    return mixed;
+}
+
+/** Get one random byte.
+ * Advances the global counter and stirs it into the global seed.
+ * Note the result of stir() is converted to int as-is; it is not masked
+ * down to 8 bits.
+ *
+ * @return random value
+ */
+int get_random_byte(void){
+    count += 1;
+    return stir(&seed, count);
+}
+
+#if 0
+/* Get some random bytes.
+ *
+ * @param dst destination for the bytes
+ * @param dst_n number of bytes to get
+ */
+void get_random_bytes(void *dst, int dst_n){
+ int i;
+ char *p = (char *)dst;
+ for(i = 0; i < dst_n; i++){
+ *p++ = get_random_byte();
+ }
+}
+#endif
+
+/** Contribute a random byte.
+ * Stirs the advanced global counter into the seed, then stirs in the
+ * contributed value.
+ *
+ * @param b byte to contribute
+ */
+void add_random_byte(int b){
+    count += 1;
+    stir(&seed, count);
+    stir(&seed, b);
+}
+
+/** Contribute some random bytes.
+ * Each byte is passed to add_random_byte() in order. The source buffer is
+ * only read, so it is accessed through a const pointer rather than casting
+ * the const away.
+ *
+ * @param src bytes to contribute
+ * @param src_n number of bytes (nothing is done if it is <= 0)
+ */
+void add_random_bytes(const void *src, int src_n){
+    const char *p = src;
+    int i;
+    for(i = 0; i < src_n; i++){
+        add_random_byte(p[i]);
+    }
+}
+
+/** Initialize the random module.
+ * Seeds the generator with the address of a local variable and then
+ * contributes the current jiffies value (converted to int).
+ *
+ * @return 0 always
+ */
+int __init random_module_init(void){
+    int stack_marker;
+    int now = jiffies;
+
+    seed = (unsigned long)&stack_marker;
+    add_random_byte(now);
+    return 0;
+}
+
+/** Finalize the random module. Nothing to release, so this is a no-op. */
+void __exit random_module_exit(void){
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_RANDOM_H__
+#define __VNET_RANDOM_H__
+
+/* Return a pseudo-random value from the module's generator. */
+extern int get_random_byte(void);
+/* Fill dst with dst_n random bytes.
+ * NOTE(review): the module's own definition of this function appears to be
+ * compiled out (#if 0), so this presumably resolves to the kernel's
+ * get_random_bytes() - confirm.
+ */
+extern void get_random_bytes(void *dst, int dst_n);
+/* Contribute one value to the generator state. */
+extern void add_random_byte(int b);
+/* Contribute src_n bytes starting at src to the generator state. */
+extern void add_random_bytes(const void *src, int src_n);
+
+/* Module setup and teardown hooks. */
+extern int random_module_init(void);
+extern void random_module_exit(void);
+
+#endif /* ! __VNET_RANDOM_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+
+#include <sa.h>
+#include <sa_algorithm.h>
+#include "hash_table.h"
+#include "allocate.h"
+
+#define MODULE_NAME "IPSEC"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** @file IPSEC Security Association (SA).
+ */
+
+/** Maximum number of protocols.*/
+#define INET_PROTOCOL_MAX 256
+
+/** Table of SA types indexed by protocol. */
+static SAType *sa_type[INET_PROTOCOL_MAX] = {};
+
+/** Map a protocol number to an index into the sa_type table by masking it
+ * with (INET_PROTOCOL_MAX - 1).
+ *
+ * @param protocol protocol number
+ * @return hashcode
+ */
+static inline unsigned char InetProtocol_hash(int protocol){
+    const int mask = INET_PROTOCOL_MAX - 1;
+    return protocol & mask;
+}
+
+/** Register an SA type in the table slot for its protocol.
+ * Fails if the type is NULL or the slot is already occupied.
+ *
+ * @param type SA type
+ * @return 0 on success, -EINVAL otherwise
+ */
+int SAType_add(SAType *type){
+    int slot;
+
+    if(!type) return -EINVAL;
+    slot = InetProtocol_hash(type->protocol);
+    if(sa_type[slot]) return -EINVAL;
+    sa_type[slot] = type;
+    return 0;
+}
+
+/** Deregister an SA type by clearing the table slot for its protocol.
+ * Fails if the type is NULL or the slot is empty. The slot is cleared
+ * without checking that it actually holds the given type.
+ *
+ * @param type SA type
+ * @return 0 on success, -EINVAL otherwise
+ */
+int SAType_del(SAType *type){
+    int slot;
+
+    if(!type) return -EINVAL;
+    slot = InetProtocol_hash(type->protocol);
+    if(!sa_type[slot]) return -EINVAL;
+    sa_type[slot] = NULL;
+    return 0;
+}
+
+/** Look up the SA type registered for a protocol.
+ *
+ * @param protocol protocol number
+ * @param type return parameter for the SA type (set to NULL if none is
+ *             registered); must not be NULL
+ * @return 0 on success, -ENOENT if no type is registered
+ */
+int SAType_get(int protocol, SAType **type){
+    SAType *found = sa_type[InetProtocol_hash(protocol)];
+
+    *type = found;
+    return (found ? 0 : -ENOENT);
+}
+
+/* Defeat compiler warnings about unused functions. */
+static int sa_key_check(SAKey *key, enum sa_alg_type type) __attribute__((unused));
+static u32 random_spi(void) __attribute__((unused));
+static u32 generate_key(u32 key, u32 offset, u32 spi) __attribute__((unused));
+
+/** Check a key has an acceptable length for an algorithm.
+ * Currently a stub: no check is performed and every key is accepted.
+ *
+ * @param key key (unused)
+ * @param type algorithm (unused)
+ * @return 0 always
+ */
+static int sa_key_check(SAKey *key, enum sa_alg_type type){
+ return 0;
+}
+
+static unsigned long sa_spi_counter = 0;
+
+/** Generate a non-zero spi from a hashed counter.
+ * Each attempt consumes one value of sa_spi_counter and runs it through
+ * pseudo_des(); the right-hand word is carried over between attempts.
+ * Attempts repeat until the truncated result is non-zero.
+ *
+ * @return spi (never 0)
+ */
+static u32 random_spi(void){
+    unsigned long lhs, rhs = 0;
+    u32 spi = 0;
+
+    while(!spi){
+        lhs = sa_spi_counter++;
+        pseudo_des(&lhs, &rhs);
+        spi = rhs;
+    }
+    return spi;
+}
+
+/** Fold a sequence of words into one output word.
+ * Each input is xor-ed into the left word, then both words go through
+ * pseudo_des(); the final right word is the result.
+ * This derives spis and keying material from secrets, so it probably ought
+ * to be cryptographically strong - a good hash (sha1) or cipher (aes)
+ * would be a better choice.
+ *
+ * @param input input values
+ * @param n number of values
+ * @return mangled value
+ */
+static u32 mangle(u32 input[], int n){
+    unsigned long left = 0, right = 0;
+    int k = 0;
+
+    while(k < n){
+        left ^= input[k];
+        pseudo_des(&left, &right);
+        k++;
+    }
+    return (u32)right;
+}
+
+/** Derive a spi from a secret key, an offset, a protocol and an address.
+ * Varying the offset yields different spis for the same protocol and
+ * address.
+ *
+ * @param key key
+ * @param offset offset
+ * @param protocol protocol
+ * @param addr IP address
+ * @return spi (may be 0)
+ */
+static u32 generate_spi(u32 key, u32 offset, u32 protocol, u32 addr){
+    u32 words[4];
+    u32 result;
+
+    words[0] = key;
+    words[1] = offset;
+    words[2] = protocol;
+    words[3] = addr;
+    dprintf(">\n");
+    result = mangle(words, 4);
+    dprintf("< spi=%x\n", result);
+    return result;
+}
+
+/** Derive keying material for a spi from a secret and an offset.
+ *
+ * @param key secret
+ * @param offset offset
+ * @param spi spi
+ * @return keying material
+ */
+static u32 generate_key(u32 key, u32 offset, u32 spi){
+    u32 words[3];
+
+    words[0] = key;
+    words[1] = offset;
+    words[2] = spi;
+    return mangle(words, 3);
+}
+
+/** Allocate a spi that is not already in the SA table.
+ *
+ * Candidates are derived deterministically from the key, protocol and
+ * address using successive offsets, so a peer sharing the key can
+ * recompute them. Zero candidates are skipped. A candidate that is already
+ * in use is rejected and the next offset is tried, up to a fixed number
+ * of attempts.
+ *
+ * Design notes: with static keying both ends need to agree on the key, and
+ * traffic may arrive on a spi for which no SA exists yet because peers are
+ * not known in advance. Deriving the spi from the destination address and
+ * a secret lets the receiver check that a spi is acceptable and create the
+ * SA on demand, with the keys derived from the spi using another secret.
+ * Collisions are possible, hence the in-use check.
+ *
+ * sa_table_lookup_spi() returns a referenced SA, so the reference taken on
+ * an in-use candidate is dropped again here.
+ *
+ * @param key spi generation key
+ * @param protocol protocol
+ * @param addr IP address
+ * @param spip return parameter for spi (written only on success)
+ * @return 0 on success, -ENOMEM if no free spi was found
+ */
+int sa_spi_alloc(u32 key, u32 protocol, u32 addr, u32 *spip){
+    int i, n = 100;
+
+    for(i = 0; i < n; i++){
+        u32 spi = generate_spi(key, i, protocol, addr);
+        SAState *inuse;
+
+        if(!spi) continue;
+        inuse = sa_table_lookup_spi(spi, protocol, addr);
+        if(!inuse){
+            *spip = spi;
+            return 0;
+        }
+        // Candidate already taken: release the lookup's reference.
+        SAState_decref(inuse);
+    }
+    return -ENOMEM;
+}
+
+/** Table of SAs. Indexed by unique id and spi/protocol/addr triple.
+ */
+static HashTable *sa_table = NULL;
+
+static u32 sa_id = 1;
+
+/** Compute the table hashcode for an SA id using hash_ul().
+ *
+ * @param id SA id
+ * @return hashcode
+ */
+static inline Hashcode sa_table_hash_id(u32 id){
+    Hashcode code = hash_ul(id);
+    return code;
+}
+
+/** Compute the table hashcode for an spi/protocol/addr triple:
+ * spi and protocol are hashed together, then the address is folded in.
+ *
+ * @param spi spi
+ * @param protocol protocol
+ * @param addr IP address
+ * @return hashcode
+ */
+static inline Hashcode sa_table_hash_spi(u32 spi, u32 protocol, u32 addr){
+    return hash_hul(hash_2ul(spi, protocol), addr);
+}
+
+/** Table predicate: does the entry hold exactly the given SA pointer?
+ *
+ * @param arg contains SA pointer
+ * @param table hashtable (unused)
+ * @param entry entry containing SA
+ * @return 1 if it does, 0 otherwise
+ */
+static int sa_table_state_fn(TableArg arg, HashTable *table, HTEntry *entry){
+    if(entry->value == arg.ptr) return 1;
+    return 0;
+}
+
+/** Table predicate: does the entry's SA have the given unique id?
+ *
+ * @param arg contains SA id (in arg.ul)
+ * @param table hashtable (unused)
+ * @param entry entry containing SA
+ * @return 1 if it does, 0 otherwise
+ */
+static int sa_table_id_fn(TableArg arg, HashTable *table, HTEntry *entry){
+    u32 wanted = arg.ul;
+    SAState *sa = entry->value;
+
+    return (sa->ident.id == wanted);
+}
+
+/** Table predicate: does the entry's SA match the given spi, protocol and
+ * address?
+ *
+ * @param arg contains SAIdent pointer
+ * @param table hashtable (unused)
+ * @param entry entry containing SA
+ * @return 1 if it does, 0 otherwise
+ */
+static int sa_table_spi_fn(TableArg arg, HashTable *table, HTEntry *entry){
+    SAIdent *want = arg.ptr;
+    SAState *sa = entry->value;
+
+    if(sa->ident.spi != want->spi) return 0;
+    if(sa->ident.protocol != want->protocol) return 0;
+    return (sa->ident.addr == want->addr);
+}
+
+/** Entry destructor for the SA table.
+ * Drops the table's reference on the SA held by the entry (if any) and
+ * deallocates the entry itself. A NULL entry is ignored.
+ *
+ * @param table containing table (unused)
+ * @param entry to free
+ */
+void sa_table_free_fn(HashTable *table, HTEntry *entry){
+    SAState *sa;
+
+    if(!entry) return;
+    sa = entry->value;
+    if(sa){
+        SAState_decref(sa);
+    }
+    deallocate(entry);
+}
+
+/** Create the SA table and install sa_table_free_fn as its entry
+ * destructor.
+ *
+ * @return 0 on success, -ENOMEM if the table could not be created
+ */
+int sa_table_init(void){
+    sa_table = HashTable_new(0);
+    if(!sa_table) return -ENOMEM;
+    sa_table->entry_free_fn = sa_table_free_fn;
+    return 0;
+}
+
+/** Free the SA table.
+ * The global pointer is cleared afterwards so that it does not dangle.
+ */
+void sa_table_exit(void){
+    HashTable_free(sa_table);
+    sa_table = NULL;
+}
+
+/** Remove an SA from the table.
+ * Removes the entry indexed by id and, when the spi is non-zero and hashes
+ * differently from the id, the entry indexed by spi/protocol/addr.
+ * Entries are matched on the SA pointer.
+ *
+ * @param state SA
+ * @return sum of the values returned by HashTable_remove_entry()
+ */
+int sa_table_delete(SAState *state){
+    TableArg arg = { .ptr = state };
+    Hashcode id_hash, spi_hash;
+    int removed;
+
+    id_hash = sa_table_hash_id(state->ident.id);
+    removed = HashTable_remove_entry(sa_table, id_hash, sa_table_state_fn, arg);
+    if(state->ident.spi){
+        spi_hash = sa_table_hash_spi(state->ident.spi,
+                                     state->ident.protocol,
+                                     state->ident.addr);
+        if(spi_hash != id_hash){
+            removed += HashTable_remove_entry(sa_table, spi_hash,
+                                              sa_table_state_fn, arg);
+        }
+    }
+    return removed;
+}
+
+/** Add an SA to the table.
+ * The SA is indexed by id and, if the spi is non-zero and its hash differs
+ * from the id hash, by spi/protocol/addr as well. Each table entry takes
+ * its own reference on the SA. If the second entry cannot be added, the
+ * first is removed again.
+ *
+ * @param state SA
+ * @return 0 on success, -ENOMEM if an entry could not be added
+ */
+int sa_table_add(SAState *state){
+    int err = -ENOMEM;
+    int indexed = 0;
+    Hashcode id_hash, spi_hash;
+
+    dprintf(">\n");
+    // Index by id.
+    id_hash = sa_table_hash_id(state->ident.id);
+    if(!HashTable_add_entry(sa_table, id_hash, HKEY(state->ident.id), state)){
+        goto exit;
+    }
+    indexed = 1;
+    SAState_incref(state);
+    // Index by spi/protocol/addr if spi non-zero.
+    if(state->ident.spi){
+        spi_hash = sa_table_hash_spi(state->ident.spi,
+                                     state->ident.protocol,
+                                     state->ident.addr);
+        if(spi_hash != id_hash){
+            if(!HashTable_add_entry(sa_table, spi_hash, HKEY(state->ident.id), state)){
+                goto exit;
+            }
+            SAState_incref(state);
+        }
+    }
+    err = 0;
+  exit:
+    // Undo a partial insertion.
+    if(err && indexed){
+        sa_table_delete(state);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+
+/** Find an SA by spi/protocol/addr.
+ * On success the SA's refcount is incremented; the caller owns that
+ * reference.
+ *
+ * @param spi spi
+ * @param protocol protocol
+ * @param addr IP address
+ * @return SA or NULL
+ */
+SAState * sa_table_lookup_spi(u32 spi, u32 protocol, u32 addr){
+    SAIdent key = {
+        .spi = spi,
+        .protocol = protocol,
+        .addr = addr };
+    TableArg arg = { .ptr = &key };
+    HTEntry *hit;
+    SAState *found;
+
+    hit = HashTable_find_entry(sa_table,
+                               sa_table_hash_spi(spi, protocol, addr),
+                               sa_table_spi_fn, arg);
+    if(!hit) return NULL;
+    found = hit->value;
+    SAState_incref(found);
+    return found;
+}
+
+/** Find an SA by unique id.
+ * On success the SA's refcount is incremented; the caller owns that
+ * reference.
+ *
+ * @param id id
+ * @return SA or NULL
+ */
+SAState * sa_table_lookup_id(u32 id){
+    TableArg arg = { .ul = id };
+    SAState *found = NULL;
+    HTEntry *hit;
+
+    dprintf("> id=%u\n", id);
+    hit = HashTable_find_entry(sa_table, sa_table_hash_id(id),
+                               sa_table_id_fn, arg);
+    if(hit){
+        found = hit->value;
+        SAState_incref(found);
+    }
+    dprintf("< state=%p\n", found);
+    return found;
+}
+
+/** Replace an existing SA by another in the table.
+ * Only an SA whose keying state is SA_STATE_ACQUIRE may be replaced.
+ * The new SA is added first; the existing one is removed only if that
+ * succeeded. There is no check for the existing SA being in use.
+ *
+ * @param existing SA to replace
+ * @param state new SA
+ * @return 0 on success, error code otherwise
+ */
+static int sa_table_replace(SAState *existing, SAState *state){
+    int err;
+
+    dprintf(">\n");
+    if(existing->keying.state == SA_STATE_ACQUIRE){
+        err = sa_table_add(state);
+        if(!err){
+            sa_table_delete(existing);
+        }
+    } else {
+        err = -EINVAL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Allocate an SA with GFP_ATOMIC.
+ * The new SA is zeroed, has a refcount of 1 and an unlocked spinlock.
+ *
+ * @return SA or NULL
+ */
+SAState *SAState_alloc(void){
+    SAState *state = kmalloc(sizeof(SAState), GFP_ATOMIC);
+
+    dprintf(">\n");
+    if(state){
+        *state = (SAState){};
+        atomic_set(&state->refcount, 1);
+        state->lock = SPIN_LOCK_UNLOCKED;
+    }
+    dprintf("< state=%p\n", state);
+    return state;
+}
+
+/** Create an SA in initial state.
+ * It has no spi and its keying state is acquire.
+ * It must have a unique id, protocol and address.
+ * At some point it should get updated with a complete SA.
+ *
+ * The new SA (or NULL on failure) is handed back through statep, so the
+ * caller actually receives the allocation.
+ *
+ * @param ident SA identifier (spi must be zero, id must be non-zero)
+ * @param statep return parameter for new SA
+ * @return 0 on success, -EINVAL for a bad identifier, -ENOMEM if
+ *         allocation fails
+ */
+int SAState_init(SAIdent *ident, SAState **statep){
+    int err = 0;
+    SAState *state = NULL;
+
+    if(ident->spi || !ident->id){
+        err = -EINVAL;
+        goto exit;
+    }
+    state = SAState_alloc();
+    if (!state){
+        err = -ENOMEM;
+        goto exit;
+    }
+    state->ident = *ident;
+    state->keying.state = SA_STATE_ACQUIRE;
+  exit:
+    *statep = state;
+    return err;
+}
+
+/** Build a complete SA (spi and cipher suite) from the given parameters.
+ * Copies the identifier, limits, digest, cipher, compress and security
+ * settings from info, looks up the SA type for the protocol, runs the
+ * type's init handler and marks the SA valid.
+ * On any failure the SA is released and *statep is set to NULL.
+ *
+ * @param info SA parameters
+ * @param statep return parameter for new SA
+ * @return 0 on success, error code otherwise
+ */
+int SAState_create(SAInfo *info, SAState **statep){
+    int err = 0;
+    SAState *sa;
+
+    dprintf(">\n");
+    sa = SAState_alloc();
+    if(sa){
+        sa->ident = info->ident;
+        sa->limits = info->limits;
+        sa->digest = info->digest;
+        sa->cipher = info->cipher;
+        sa->compress = info->compress;
+        sa->security = info->security;
+        err = SAType_get(sa->ident.protocol, &sa->type);
+        if(!err){
+            err = sa->type->init(sa, NULL);
+        }
+        if(!err){
+            sa->keying.state = SA_STATE_VALID;
+        }
+    } else {
+        err = -ENOMEM;
+    }
+    if(err){
+        SAState_decref(sa);
+        sa = NULL;
+    }
+    *statep = sa;
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Create an SA for the given spi etc. and add it to the table via sa_set().
+ * For now the cipher suite and the keys are fixed:
+ * the digest is SHA1 HMAC with a 128-bit key, and the cipher is
+ * AES (Rijndael) in CBC mode with a 128-bit key. When SA_CONF is not
+ * set in security the cipher is "cipher_null" with no key.
+ *
+ * The cipher suite and keys should really come from policy, with the
+ * possibility of negotiating them with the peer (using IKE).
+ * Negotiation creates difficulties though - because the SA cannot
+ * be created immediately we have to be able to queue packets
+ * while the SA is being negotiated.
+ *
+ * @param security security flags (SA_AUTH, SA_CONF)
+ * @param spi spi (if zero one is generated with generate_spi())
+ * @param protocol protocol
+ * @param addr address
+ * @param sa return parameter for SA
+ * @return 0 on success, error code otherwise
+ */
+int sa_create(int security, u32 spi, u32 protocol, u32 addr, SAState **sa){
+    char *digest_name = "sha1";
+    char *digest_key = "0123456789abcdef";
+    char *cipher_name= "aes";
+    char *cipher_key = "0123456789ABCDEF";
+    int digest_key_n = strlen(digest_key);
+    int cipher_key_n = strlen(cipher_key);
+    SAInfo info = {};
+    SAKey *digest = &info.digest;
+    SAKey *cipher = &info.cipher;
+    int err;
+
+    dprintf("> security=%d spi=%u protocol=%u addr=" IPFMT "\n",
+            security, spi, protocol, NIPQUAD(addr));
+    if(!spi){
+        spi = generate_spi(0, 0, protocol, addr);
+    }
+    dprintf("> info...\n");
+    // Each SA created here takes the next value of the sa_id counter.
+    info.ident.id = sa_id++;
+    info.ident.spi = spi;
+    info.ident.protocol = protocol;
+    info.ident.addr = addr;
+    info.security = security;
+
+    //sa_algorithm_probe_all();
+
+    dprintf("> digest name=%s key_n=%d\n", digest_name, digest_key_n);
+    strcpy(digest->name, digest_name);
+    digest->bits = digest_key_n * 8;
+    memcpy(digest->key, digest_key, digest_key_n);
+
+    if(!(security & SA_CONF)){
+        // No confidentiality requested: null cipher, empty key.
+        dprintf("> cipher name=%s key_n=%d\n", "cipher_null", 0);
+        strcpy(cipher->name, "cipher_null");
+        cipher->bits = 0;
+        memset(cipher->key, 0, sizeof(info.cipher.key));
+    } else {
+        dprintf("> cipher name=%s key_n=%d\n", cipher_name, cipher_key_n);
+        strcpy(cipher->name, cipher_name);
+        cipher->bits = cipher_key_n * 8;
+        memcpy(cipher->key, cipher_key, cipher_key_n);
+    }
+
+    err = sa_set(&info, 0, sa);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Create or update an SA and put it in the SA table.
+ * Creating fails with -EINVAL if an SA with the same id already exists;
+ * updating fails with -ENOENT if none exists. An update replaces the
+ * existing SA using sa_table_replace().
+ *
+ * On success with a non-null val the new SA is returned through val
+ * along with the reference taken at creation. In every other case that
+ * reference is dropped here.
+ *
+ * @param info SA parameters
+ * @param update create if zero, update otherwise
+ * @param val return parameter for the SA (may be null)
+ * @return 0 on success, error code otherwise
+ */
+int sa_set(SAInfo *info, int update, SAState **val){
+    int err = 0;
+    SAState *created = NULL;
+    SAState *existing;
+
+    dprintf("> info=%p update=%d val=%p\n", info, update, val);
+    existing = sa_table_lookup_id(info->ident.id);
+    if(update){
+        if(!existing) err = -ENOENT;
+    } else {
+        if(existing) err = -EINVAL;
+    }
+    if(!err){
+        err = SAState_create(info, &created);
+    }
+    if(!err){
+        err = (existing
+               ? sa_table_replace(existing, created)
+               : sa_table_add(created));
+    }
+    // Drop the reference taken by the lookup.
+    if(existing) SAState_decref(existing);
+    if(!err && val){
+        *val = created;
+    } else {
+        SAState_decref(created);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Remove the SA with the given id from the SA table.
+ * Fails if the table holds no SA with that id.
+ *
+ * @param id SA id
+ * @return 0 on success, -ENOENT if there is no such SA
+ */
+int sa_delete(int id){
+    SAState *sa = sa_table_lookup_id(id);
+
+    if(!sa) return -ENOENT;
+    sa_table_delete(sa);
+    // Release the reference taken by the lookup above.
+    SAState_decref(sa);
+    return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_SA_H__
+#define __VNET_SA_H__
+
+#include <linux/types.h>
+#include <linux/crypto.h>
+
+#include <tunnel.h>
+
+/* Fallback key size limits, used to size SAKey.key.
+ * Both macros are defined only when CRYPTO_MAX_KEY_BYTES is not already
+ * defined; if it is, CRYPTO_MAX_KEY_BITS is left as the environment has it.
+ */
+#ifndef CRYPTO_MAX_KEY_BYTES
+#define CRYPTO_MAX_KEY_BYTES 64
+#define CRYPTO_MAX_KEY_BITS (CRYPTO_MAX_KEY_BYTES * 8)
+#endif
+
+/** SA lifetime limits, expressed in bytes and in packets.
+ * NOTE(review): soft/hard presumably follow the usual IPsec meaning
+ * (soft = warn or rekey, hard = expire) - confirm where they are enforced.
+ */
+typedef struct SALimits {
+ u64 bytes_soft;
+ u64 bytes_hard;
+ u64 packets_soft;
+ u64 packets_hard;
+} SALimits;
+
+/** Per-SA counters: bytes, packets and integrity failures.
+ * NOTE(review): presumably updated by the SA type's send/recv handlers -
+ * confirm against the ESP implementation.
+ */
+typedef struct SACounts {
+ u64 bytes;
+ u64 packets;
+ u32 integrity_failures;
+} SACounts;
+
+/** Replay protection state for an SA.
+ * NOTE(review): field meanings are inferred from their names
+ * (enable flag, send/receive sequence numbers, received-sequence bitmap,
+ * window size) - confirm against the code that uses them.
+ */
+typedef struct SAReplay {
+ int replay;
+ u32 send_seq;
+ u32 recv_seq;
+ u32 bitmap;
+ u32 replay_window;
+} SAReplay;
+
+/** An algorithm selection together with its key. */
+typedef struct SAKey {
+ /* Algorithm name, e.g. "sha1", "aes" or "cipher_null". */
+ char name[CRYPTO_MAX_ALG_NAME];
+ /* Key size in bits (zero when there is no key). */
+ int bits;
+ /* Raw key bytes; only the first bits/8 bytes are meaningful. */
+ char key[CRYPTO_MAX_KEY_BYTES];
+} SAKey;
+
+/** Keying state of an SA. */
+typedef struct SAKeying {
+ /* One of SA_STATE_ACQUIRE or SA_STATE_VALID. */
+ u8 state;
+ /* NOTE(review): presumably flags an SA that is being retired - confirm. */
+ u8 dying;
+} SAKeying;
+
+/** SA identifier.
+ * An SA can be looked up in the SA table either by id or by the
+ * (spi, protocol, addr) triple.
+ */
+typedef struct SAIdent {
+ /* Unique id. */
+ u32 id;
+ /* Security parameter index; zero for an SA in acquire state. */
+ u32 spi;
+ /* Address. */
+ u32 addr;
+ /* Protocol; used to select the SAType. */
+ u32 protocol;
+} SAIdent;
+
+struct SAType;
+
+/** Security association (SA).
+ * Reference counted: use SAState_incref() / SAState_decref().
+ */
+typedef struct SAState {
+ /** Reference count; the SA is freed when it drops to zero. */
+ atomic_t refcount;
+ /** Lock; initialised unlocked by SAState_alloc(). */
+ spinlock_t lock;
+ /** Identifier. */
+ struct SAIdent ident;
+ /** Security flags. */
+ int security;
+ /** Keying state. */
+ struct SAKeying keying;
+ /** Byte counts etc. */
+ struct SACounts counts;
+ /** Byte limits etc. */
+ struct SALimits limits;
+ /** Replay protection. */
+ struct SAReplay replay;
+ /** Digest algorithm. */
+ struct SAKey digest;
+ /** Cipher algorithm. */
+ struct SAKey cipher;
+ /** Compress algorithm. */
+ struct SAKey compress;
+ /** SA type (ESP, AH). */
+ struct SAType *type;
+ /** Data for the SA type to use. */
+ void *data;
+} SAState;
+
+/** Operations implementing one kind of SA.
+ * Registered with SAType_add() and looked up by protocol with SAType_get().
+ */
+typedef struct SAType {
+ /* Name of the type. */
+ char *name;
+ /* Protocol number the type handles. */
+ int protocol;
+ /* Set up a new SA; SAState_create() calls this with args == NULL. */
+ int (*init)(SAState *state, void *args);
+ /* Tear down an SA; called from SAState_decref() before it is freed. */
+ void (*fini)(SAState *state);
+ /* Handler behind SAState_recv(). */
+ int (*recv)(SAState *state, struct sk_buff *skb);
+ /* Handler behind SAState_send(). */
+ int (*send)(SAState *state, struct sk_buff *skb, Tunnel *tunnel);
+ /* Handler behind SAState_size(). */
+ u32 (*size)(SAState *state, int size);
+} SAType;
+
+/** Information needed to create an SA.
+ * Unused algorithms have zero key size.
+ * Passed to SAState_create() and sa_set().
+ */
+typedef struct SAInfo {
+ /** Identifier. */
+ SAIdent ident;
+ /** Security flags. */
+ int security;
+ /** Digest algorithm and key. */
+ SAKey digest;
+ /** Cipher algorithm and key. */
+ SAKey cipher;
+ /** Compress algorithm and key. */
+ SAKey compress;
+ /** SA lifetime limits. */
+ SALimits limits;
+ /** Replay protection window.
+  * NOTE(review): SAState_create() does not copy this field into the SA.
+  */
+ int replay_window;
+} SAInfo;
+
+/** Kinds of algorithm an SA can use: digest, cipher or compression. */
+enum sa_alg_type {
+ SA_ALG_DIGEST = 1,
+ SA_ALG_CIPHER = 2,
+ SA_ALG_COMPRESS = 3,
+};
+
+extern int SAType_add(SAType *type);
+extern int SAType_del(SAType *type);
+extern int SAType_get(int protocol, SAType **type);
+
+extern int sa_table_init(void);
+extern void sa_table_exit(void);
+extern int sa_table_delete(SAState *state);
+extern int sa_table_add(SAState *state);
+extern SAState * sa_table_lookup_spi(u32 spi, u32 protocol, u32 addr);
+extern SAState * sa_table_lookup_id(u32 id);
+
+/** Take an additional reference on an SA.
+ * Does nothing when sa is null.
+ *
+ * @param sa security association (may be null)
+ */
+static inline void SAState_incref(SAState *sa){
+    if(sa){
+        atomic_inc(&sa->refcount);
+    }
+}
+
+/** Decrement reference count, freeing if zero.
+ * When the last reference is dropped the SA type's fini handler is run
+ * (if the SA has a type) and the SA is freed.
+ *
+ * @param sa security association (may be null)
+ */
+static inline void SAState_decref(SAState *sa){
+    if(!sa) return;
+    if(atomic_dec_and_test(&sa->refcount)){
+        // An SA can legitimately have no type: SAState_alloc() zeroes it,
+        // SAState_init() never sets one, and SAState_create() releases the
+        // SA when the type lookup fails. Only finalize when a type exists.
+        if(sa->type && sa->type->fini){
+            sa->type->fini(sa);
+        }
+        kfree(sa);
+    }
+}
+
+extern SAState *SAState_alloc(void);
+extern int SAState_init(SAIdent *id, SAState **statep);
+extern int SAState_create(SAInfo *info, SAState **statep);
+
+/** Pass an outbound skb to the send handler of the SA's type.
+ *
+ * @param sa security association (must have a type)
+ * @param skb skb to send
+ * @param tunnel tunnel handed on to the handler
+ * @return the handler's result
+ */
+static inline int SAState_send(SAState *sa, struct sk_buff *skb, Tunnel *tunnel){
+    SAType *type = sa->type;
+
+    return type->send(sa, skb, tunnel);
+}
+
+/** Pass an inbound skb to the recv handler of the SA's type.
+ *
+ * @param sa security association (must have a type)
+ * @param skb received skb
+ * @return the handler's result
+ */
+static inline int SAState_recv(SAState *sa, struct sk_buff *skb){
+    SAType *type = sa->type;
+
+    return type->recv(sa, skb);
+}
+
+/** Ask the SA's type for a size, given a size n.
+ * The handler returns u32; the value is converted to int on return.
+ *
+ * @param sa security association (must have a type)
+ * @param n size passed to the handler
+ * @return the handler's result
+ */
+static inline int SAState_size(SAState *sa, int n){
+    SAType *type = sa->type;
+
+    return type->size(sa, n);
+}
+
+extern int sa_create(int security, u32 spi, u32 protocol, u32 addr, SAState **sa);
+extern int sa_set(SAInfo *info, int update, SAState **val);
+extern int sa_delete(int id);
+
+/** Security flags (bit values) for the 'security' field of an SA.
+ * sa_create() selects a real cipher only when SA_CONF is set.
+ */
+enum {
+ SA_AUTH = 1,
+ SA_CONF = 2
+};
+
+/** Values for SAKeying.state.
+ * SAState_init() creates SAs in ACQUIRE state; SAState_create() marks
+ * a fully initialised SA as VALID.
+ */
+enum {
+ SA_STATE_ACQUIRE = 1,
+ SA_STATE_VALID = 2,
+};
+
+#endif /* !__VNET_SA_H__ */
--- /dev/null
+/*
+ * Copyright (c) 2002 James Morris <jmorris@intercode.com.au>
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/crypto.h>
+#include <linux/sched.h>
+//#include <asm/softirq.h>
+
+#include <sa_algorithm.h>
+
+#define MODULE_NAME "IPSEC"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** @file Tables of supported IPSEC algorithms.
+ * Has tables for digests, ciphers and compression algorithms.
+ */
+
+/*
+ * Algorithms supported by IPsec. These entries contain properties which
+ * are used in key negotiation and sa processing, and are used to verify
+ * that instantiated crypto transforms have correct parameters for IPsec
+ * purposes.
+ */
+
+/** Digests.
+ * The table ends with an entry whose name is NULL; the lookup and probe
+ * helpers stop there. The 'available' flag is not set here: it starts
+ * at zero and is filled in by sa_algorithm_probe().
+ */
+static SAAlgorithm digest_alg[] = {
+ {
+ .name = "digest_null",
+ .info = {
+ .digest = {
+ .icv_truncbits = 0,
+ .icv_fullbits = 0,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_X_AALG_NULL,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 0,
+ .sadb_alg_maxbits = 0
+ }
+ },
+ {
+ .name = "md5",
+ .info = { .digest = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 128,
+ } },
+ .alg = {
+ .sadb_alg_id = SADB_AALG_MD5HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 128
+ }
+ },
+ {
+ .name = "sha1",
+ .info = {
+ .digest = {
+ .icv_truncbits = 96,
+ .icv_fullbits = 160,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_AALG_SHA1HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 160,
+ .sadb_alg_maxbits = 160
+ }
+ },
+ {
+ .name = "sha256",
+ .info = {
+ .digest = {
+ .icv_truncbits = 128,
+ .icv_fullbits = 256,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 256,
+ .sadb_alg_maxbits = 256
+ }
+ },
+/* { */
+/* .name = "ripemd160", */
+/* .info = { */
+/* .digest = { */
+/* .icv_truncbits = 96, */
+/* .icv_fullbits = 160, */
+/* } */
+/* }, */
+/* .alg = { */
+/* .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, */
+/* .sadb_alg_ivlen = 0, */
+/* .sadb_alg_minbits = 160, */
+/* .sadb_alg_maxbits = 160 */
+/* } */
+/* }, */
+ { /* Terminator */ }
+};
+
+/** Ciphers.
+ * The table ends with an entry whose name is NULL; the lookup and probe
+ * helpers stop there. The 'available' flag is not set here: it starts
+ * at zero and is filled in by sa_algorithm_probe().
+ */
+static SAAlgorithm cipher_alg[] = {
+ {
+ .name = "cipher_null",
+ .info = {
+ .cipher = {
+ .blockbits = 8,
+ .defkeybits = 0,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_EALG_NULL,
+ .sadb_alg_ivlen = 0,
+ .sadb_alg_minbits = 0,
+ .sadb_alg_maxbits = 0
+ }
+ },
+ {
+ .name = "des",
+ .info = {
+ .cipher = {
+ .blockbits = 64,
+ .defkeybits = 64,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_EALG_DESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 64,
+ .sadb_alg_maxbits = 64
+ }
+ },
+ {
+ .name = "des3_ede",
+ .info = {
+ .cipher = {
+ .blockbits = 64,
+ .defkeybits = 192,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_EALG_3DESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 192,
+ .sadb_alg_maxbits = 192
+ }
+ },
+/* { */
+/* .name = "cast128", */ //cast5?
+/* .info = { */
+/* .cipher = { */
+/* .blockbits = 64, */
+/* .defkeybits = 128, */
+/* } */
+/* }, */
+/* .alg = { */
+/* .sadb_alg_id = SADB_X_EALG_CASTCBC, */
+/* .sadb_alg_ivlen = 8, */
+/* .sadb_alg_minbits = 40, */
+/* .sadb_alg_maxbits = 128 */
+/* } */
+/* }, */
+ {
+ .name = "blowfish",
+ .info = {
+ .cipher = {
+ .blockbits = 64,
+ .defkeybits = 128,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 40,
+ .sadb_alg_maxbits = 448
+ }
+ },
+ {
+ .name = "aes",
+ .info = {
+ .cipher = {
+ .blockbits = 128,
+ .defkeybits = 128,
+ }
+ },
+ .alg = {
+ .sadb_alg_id = SADB_X_EALG_AESCBC,
+ .sadb_alg_ivlen = 8,
+ .sadb_alg_minbits = 128,
+ .sadb_alg_maxbits = 256
+ }
+ },
+ { /* Terminator */ }
+};
+
+/** Compressors.
+ * The table ends with an entry whose name is NULL; the lookup and probe
+ * helpers stop there. The 'available' flag is not set here: it starts
+ * at zero and is filled in by sa_algorithm_probe().
+ */
+static SAAlgorithm compress_alg[] = {
+ {
+ .name = "deflate",
+ .info = {
+ .compress = {
+ .threshold = 90,
+ }
+ },
+ .alg = { .sadb_alg_id = SADB_X_CALG_DEFLATE }
+ },
+/* { */
+/* .name = "lzs", */
+/* .info = { */
+/* .compress = { */
+/* .threshold = 90, */
+/* } */
+/* }, */
+/* .alg = { .sadb_alg_id = SADB_X_CALG_LZS } */
+/* }, */
+/* { */
+/* .name = "lzjh", */
+/* .info = { */
+/* .compress = { */
+/* .threshold = 50, */
+/* } */
+/* }, */
+/* .alg = { .sadb_alg_id = SADB_X_CALG_LZJH } */
+/* }, */
+ { /* Terminator */ }
+};
+
+/** Find an algorithm in a table by its sadb algorithm id.
+ * The search stops at the first entry with a matching id; that entry is
+ * returned only if it is marked available.
+ *
+ * @param algo table to search, terminated by a null name (may be null)
+ * @param alg_id sadb algorithm id
+ * @return algorithm, or NULL if not found or not available
+ */
+static SAAlgorithm *sa_algorithm_by_id(SAAlgorithm *algo, int alg_id) {
+    if (!algo) return NULL;
+    while (algo->name) {
+        if (algo->alg.sadb_alg_id == alg_id) {
+            if (!algo->available) return NULL;
+            return algo;
+        }
+        algo++;
+    }
+    return NULL;
+}
+
+
+/** Find an algorithm in a table by name.
+ * The search stops at the first entry with a matching name; that entry
+ * is returned only if it is marked available.
+ *
+ * @param algo table to search, terminated by a null name (may be null)
+ * @param name algorithm name (may be null)
+ * @return algorithm, or NULL if not found or not available
+ */
+static SAAlgorithm *sa_algorithm_by_name(SAAlgorithm *algo, char *name) {
+    if (!name || !algo) return NULL;
+    while (algo->name) {
+        if (strcmp(name, algo->name) == 0) {
+            if (!algo->available) return NULL;
+            return algo;
+        }
+        algo++;
+    }
+    return NULL;
+}
+
+/** Look up an available digest by sadb algorithm id.
+ *
+ * @param alg_id sadb algorithm id
+ * @return digest algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_digest_by_id(int alg_id) {
+    SAAlgorithm *table = digest_alg;
+
+    return sa_algorithm_by_id(table, alg_id);
+}
+
+/** Look up an available cipher by sadb algorithm id.
+ *
+ * @param alg_id sadb algorithm id
+ * @return cipher algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_cipher_by_id(int alg_id) {
+    SAAlgorithm *table = cipher_alg;
+
+    return sa_algorithm_by_id(table, alg_id);
+}
+
+/** Look up an available compressor by sadb algorithm id.
+ *
+ * @param alg_id sadb algorithm id
+ * @return compression algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_compress_by_id(int alg_id) {
+    SAAlgorithm *table = compress_alg;
+
+    return sa_algorithm_by_id(table, alg_id);
+}
+
+/** Look up an available digest by name.
+ *
+ * @param name algorithm name (may be null)
+ * @return digest algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_digest_by_name(char *name) {
+    SAAlgorithm *table = digest_alg;
+
+    return sa_algorithm_by_name(table, name);
+}
+
+/** Look up an available cipher by name.
+ *
+ * @param name algorithm name (may be null)
+ * @return cipher algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_cipher_by_name(char *name) {
+    SAAlgorithm *table = cipher_alg;
+
+    return sa_algorithm_by_name(table, name);
+}
+
+/** Look up an available compressor by name.
+ *
+ * @param name algorithm name (may be null)
+ * @return compression algorithm, or NULL if unknown or not available
+ */
+SAAlgorithm *sa_compress_by_name(char *name) {
+    SAAlgorithm *table = compress_alg;
+
+    return sa_algorithm_by_name(table, name);
+}
+
+/** Get the digest table entry at an index.
+ * The last valid index holds the terminator entry (null name), so
+ * callers can enumerate until they see a null name or a NULL result.
+ * Unlike the by-id and by-name lookups, availability is not checked.
+ *
+ * @param idx index into the digest table
+ * @return table entry, or NULL if idx is outside the table
+ */
+SAAlgorithm *sa_digest_by_index(unsigned int idx) {
+    // Refuse indexes past the table instead of returning a wild pointer.
+    if (idx >= sizeof(digest_alg) / sizeof(digest_alg[0])) return NULL;
+    return digest_alg + idx;
+}
+
+/** Get the cipher table entry at an index.
+ * The last valid index holds the terminator entry (null name), so
+ * callers can enumerate until they see a null name or a NULL result.
+ * Unlike the by-id and by-name lookups, availability is not checked.
+ *
+ * @param idx index into the cipher table
+ * @return table entry, or NULL if idx is outside the table
+ */
+SAAlgorithm *sa_cipher_by_index(unsigned int idx) {
+    // Refuse indexes past the table instead of returning a wild pointer.
+    if (idx >= sizeof(cipher_alg) / sizeof(cipher_alg[0])) return NULL;
+    return cipher_alg + idx;
+}
+
+/** Get the compressor table entry at an index.
+ * The last valid index holds the terminator entry (null name), so
+ * callers can enumerate until they see a null name or a NULL result.
+ * Unlike the by-id and by-name lookups, availability is not checked.
+ *
+ * @param idx index into the compressor table
+ * @return table entry, or NULL if idx is outside the table
+ */
+SAAlgorithm *sa_compress_by_index(unsigned int idx) {
+    // Refuse indexes past the table instead of returning a wild pointer.
+    if (idx >= sizeof(compress_alg) / sizeof(compress_alg[0])) return NULL;
+    return compress_alg + idx;
+}
+
+/** Refresh the 'available' flag of every algorithm in a table.
+ * Each entry is checked with crypto_alg_available() and its flag is
+ * rewritten when the result differs from the stored value.
+ *
+ * @param algo table to probe, terminated by a null name (may be null)
+ */
+static void sa_algorithm_probe(SAAlgorithm *algo){
+    int status;
+    dprintf("> algo=%p\n", algo);
+    if(algo){
+        while(algo->name){
+            dprintf("> algorithm %s...\n", algo->name);
+            status = crypto_alg_available(algo->name, 0);
+            dprintf("> algorithm %s status=%d\n",algo->name, status);
+            if (algo->available != status){
+                algo->available = status;
+            }
+            algo++;
+        }
+    }
+    dprintf("<\n");
+}
+
+/** Crypto api is broken. When an unregistered algorithm is requested it
+ * tries to load a module of the same name. But not all algorithms are
+ * defined by modules of the same name.
+ *
+ * This is the NULL-terminated list of module names that
+ * sa_algorithm_probe_all() asks the kernel to load before probing.
+ * Entries that are commented out are not requested.
+ */
+static char *crypto_modules[] = {
+ "aes",
+ //"arc4",
+ "blowfish",
+ //"cast5",
+ //"cast6",
+ "crypto_null",
+ "des",
+ //"md4",
+ "md5",
+ //"serpent",
+ "sha1",
+ "sha256",
+ //"sha512",
+ //"twofish",
+ NULL
+};
+
+#include <linux/kmod.h>
+
+/** Request loading of each module in a list.
+ * The result of request_module() is ignored, so a module that cannot be
+ * loaded is simply skipped.
+ *
+ * @param modules NULL-terminated array of module names
+ */
+static void sa_module_probe(char **modules){
+    int i;
+    dprintf(">\n");
+    for(i = 0; modules[i]; i++){
+        dprintf("> %s\n", modules[i]);
+        request_module(modules[i]);
+    }
+    dprintf("<\n");
+}
+
+/**
+ * Probe for the availability of crypto algorithms.
+ * First requests the modules listed in crypto_modules, then sets the
+ * available flag of every entry in the digest, cipher and compressor
+ * tables according to what the crypto api reports.
+ */
+void sa_algorithm_probe_all(void){
+    SAAlgorithm *tables[] = { digest_alg, cipher_alg, compress_alg };
+    unsigned int i;
+
+    dprintf("> \n");
+    //BUG_ON(in_softirq());
+    sa_module_probe(crypto_modules);
+    for(i = 0; i < sizeof(tables) / sizeof(tables[0]); i++){
+        sa_algorithm_probe(tables[i]);
+    }
+    dprintf("<\n");
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_SA_ALGORITHM_H__
+#define __VNET_SA_ALGORITHM_H__
+
+#include <linux/types.h>
+#include <linux/pfkeyv2.h>
+
+/** Digest-specific properties of an algorithm.
+ * Values are in bits, e.g. sha1 has icv_truncbits 96 and icv_fullbits 160.
+ * NOTE(review): presumably the truncated and full integrity check value
+ * sizes - confirm against the ESP code that reads them.
+ */
+typedef struct SADigestInfo {
+ u16 icv_truncbits;
+ u16 icv_fullbits;
+} SADigestInfo;
+
+/** Cipher-specific properties of an algorithm.
+ * Values are in bits, e.g. aes has blockbits 128 and defkeybits 128.
+ * NOTE(review): presumably block size and default key size - confirm.
+ */
+typedef struct SACipherInfo {
+ u16 blockbits;
+ u16 defkeybits;
+} SACipherInfo;
+
+/** Compression-specific properties of an algorithm.
+ * NOTE(review): the unit and use of threshold are not visible here
+ * (deflate uses 90) - confirm against the code that reads it.
+ */
+typedef struct SACompressInfo {
+ u16 threshold;
+} SACompressInfo;
+
+/** Description of one supported algorithm (digest, cipher or compressor). */
+typedef struct SAAlgorithm {
+ /* Algorithm name as passed to the crypto api; NULL marks the table end. */
+ char *name;
+ /* Non-zero once sa_algorithm_probe() has found the algorithm. */
+ u8 available;
+ /* Properties specific to the kind of algorithm; which member is valid
+  * depends on the table the entry belongs to. */
+ union {
+ SADigestInfo digest;
+ SACipherInfo cipher;
+ SACompressInfo compress;
+ } info;
+ /* PF_KEY description: algorithm id, iv length and key size range. */
+ struct sadb_alg alg;
+} SAAlgorithm;
+
+extern SAAlgorithm *sa_digest_by_id(int alg_id);
+extern SAAlgorithm *sa_cipher_by_id(int alg_id);
+extern SAAlgorithm *sa_compress_by_id(int alg_id);
+extern SAAlgorithm *sa_digest_by_name(char *name);
+extern SAAlgorithm *sa_cipher_by_name(char *name);
+extern SAAlgorithm *sa_compress_by_name(char *name);
+extern SAAlgorithm *sa_digest_by_index(unsigned int idx);
+extern SAAlgorithm *sa_cipher_by_index(unsigned int idx);
+extern SAAlgorithm *sa_compress_by_index(unsigned int idx);
+extern void sa_algorithm_probe_all(void);
+
+#define MAX_KEY_BITS 512
+
+#endif /* ! __VNET_SA_ALGORITHM_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+
+#include <skb_context.h>
+
+#define MODULE_NAME "VNET"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** Allocate and fill in an skb context with a refcount of 1.
+ * The allocation uses GFP_ATOMIC. The new context is not linked to any
+ * other (next is NULL).
+ *
+ * @param vnet vnet id
+ * @param addr address
+ * @param protocol protocol
+ * @param data data to store in the context
+ * @param free_fn function called when the context is freed (may be null)
+ * @return new context, or NULL if out of memory
+ */
+SkbContext *SkbContext_create(u32 vnet, u32 addr, int protocol, void *data,
+                              void (*free_fn)(SkbContext *)){
+    SkbContext *ctx = kmalloc(sizeof(SkbContext), GFP_ATOMIC);
+
+    if(ctx){
+        atomic_set(&ctx->refcount, 1);
+        ctx->next = NULL;
+        ctx->free_fn = free_fn;
+        ctx->data = data;
+        ctx->protocol = protocol;
+        ctx->addr = addr;
+        ctx->vnet = vnet;
+    }
+    return ctx;
+}
+
+/** Free an skb context.
+ * Drops the reference held on the next context in the chain, then runs
+ * the context's free function (if any), clears the fields and releases
+ * the memory. Called by SkbContext_decref() when the refcount hits zero.
+ *
+ * @param context context to free (may be null)
+ */
+void SkbContext_free(SkbContext *context){
+    if(context){
+        if(context->next){
+            SkbContext_decref(context->next);
+        }
+        if(context->free_fn){
+            context->free_fn(context);
+        }
+        // Scrub the fields before handing the memory back.
+        context->next = NULL;
+        context->data = NULL;
+        context->free_fn = NULL;
+        context->protocol = 0;
+        context->addr = 0;
+        context->vnet = 0;
+        kfree(context);
+    }
+}
+
+/** Create a context and push it on the front of a context chain.
+ * The previous head of the chain becomes the 'next' of the new context.
+ * The chain is left unchanged if the context cannot be allocated.
+ *
+ * @param val in: current head of the chain, out: the new context
+ * @param vnet vnet id
+ * @param addr address
+ * @param protocol protocol
+ * @param data data to store in the context
+ * @param free_fn function called when the context is freed (may be null)
+ * @return 0 on success, -ENOMEM if out of memory
+ */
+int SkbContext_push(SkbContext **val, u32 vnet, u32 addr, int protocol,
+                    void *data, void (*free_fn)(SkbContext *)){
+    int err = 0;
+    SkbContext *head;
+
+    dprintf("> vnet=%u addr=%u.%u.%u.%u protocol=%d\n",
+            vnet, NIPQUAD(addr), protocol);
+    head = SkbContext_create(vnet, addr, protocol, data, free_fn);
+    if(head){
+        head->next = *val;
+        *val = head;
+    } else {
+        err = -ENOMEM;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Push a context onto an skb.
+ * Currently a stub: the code that would create and attach the context is
+ * commented out (todo/fixme), so the skb is not modified, none of the
+ * other arguments are used, and the result is always 0.
+ *
+ * @param skb skb
+ * @param vnet vnet id (unused)
+ * @param addr address (unused)
+ * @param protocol protocol (unused)
+ * @param data context data (unused)
+ * @param free_fn context free function (unused)
+ * @return 0
+ */
+int skb_push_context(struct sk_buff *skb, u32 vnet, u32 addr, int protocol,
+ void *data, void (*free_fn)(SkbContext *)){
+ int err = 0;
+ //SkbContext *ctxt = SKB_CONTEXT(skb);
+ dprintf("> skb=%p\n", skb);
+
+ //err = SkbContext_push(&ctxt, vnet, addr, protocol, data, free_fn); //todo fixme
+ //SKB_CONTEXT(skb) = ctxt;//todo fixme
+ dprintf("< err=%d\n", err);
+ return err;
+}
+
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __VNET_SKB_CONTEXT_H__
+#define __VNET_SKB_CONTEXT_H__
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <asm/atomic.h>
+#include <linux/types.h>
+
+/** Structure used to record inbound processing path for skbs.
+ * For example, the ETHERIP protocol handler can use this to
+ * tell whether an inbound packet came through IPSEC ESP or not.
+ *
+ * Contexts are reference counted and can be chained through 'next'.
+ */
+typedef struct SkbContext {
+ /* Vnet id. */
+ u32 vnet;
+ /* Address. */
+ u32 addr;
+ /* Protocol. */
+ int protocol;
+ /* Data supplied by whoever created the context. */
+ void *data;
+ /* Called from SkbContext_free() before the context is released
+  * (may be null). */
+ void (*free_fn)(struct SkbContext *);
+ /* Reference count; the context is freed when it drops to zero. */
+ atomic_t refcount;
+ /* Next context in the chain (may be null). */
+ struct SkbContext *next;
+} SkbContext;
+
+/** Drop a reference on a context.
+ * The context is freed with SkbContext_free() when the last reference
+ * goes away. Does nothing when context is null.
+ *
+ * @param context context (may be null)
+ */
+static inline void SkbContext_decref(SkbContext *context){
+    extern void SkbContext_free(SkbContext *context);
+    if(context && atomic_dec_and_test(&context->refcount)){
+        SkbContext_free(context);
+    }
+}
+
+/** Take an additional reference on a context.
+ * Does nothing when context is null.
+ *
+ * @param context context (may be null)
+ */
+static inline void SkbContext_incref(SkbContext *context){
+    if(context){
+        atomic_inc(&context->refcount);
+    }
+}
+
+extern SkbContext *SkbContext_create(u32 vnet, u32 addr, int protocol, void *data,
+ void (*free_fn)(SkbContext *));
+
+extern int SkbContext_push(SkbContext **val, u32 vnet, u32 addr, int protocol,
+ void *data, void (*free_fn)(SkbContext *));
+
+struct sk_buff;
+extern int skb_push_context(struct sk_buff *skb, u32 vnet, u32 addr, int protocol,
+ void *data, void (*free_fn)(SkbContext *));
+
+//todo: fixme
+// Yields the address of the start of the skb's cb[] area, cast to
+// SkbContext *. The result is a value, not an lvalue, so it cannot be
+// assigned to.
+#define SKB_CONTEXT(_skb) ((SkbContext *)(&(_skb)->cb[0]))
+
+#endif /* !__VNET_SKB_CONTEXT_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/version.h>
+
+#include <asm/scatterlist.h>
+#include <linux/crypto.h>
+#include <linux/pfkeyv2.h>
+#include <linux/random.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+
+#include <varp.h>
+#include <skb_util.h>
+
+#define MODULE_NAME "VNET"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/* Compile-time debug switches: non-zero enables the extra tracing in
+ * skb_scatterlist() and the dump in skb_print_bits() respectively. */
+static const int DEBUG_SCATTERLIST = 0;
+static const int DEBUG_SKB = 0;
+
+//============================================================================
+// Set the 'address' member of a scatterlist entry.
+// On 2.6 and later kernels this expands to nothing; on earlier kernels it
+// assigns (sg).address.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+#define SET_SCATTER_ADDR(sg, addr) do{} while(0)
+#else
+#define SET_SCATTER_ADDR(sg, addr) (sg).address = (addr)
+#endif
+
+/** Ensure an skb has room for an extra header and trailer.
+ * If the skb is neither cloned nor shared and already has the room it
+ * is returned as is. Otherwise a new skb is made - by reallocating the
+ * headroom when only headroom is missing, else by a full expanding
+ * copy - and the original skb is freed. New skbs get 16 bytes more
+ * headroom than requested.
+ * On failure the original skb is not freed and *pskb is not written.
+ *
+ * @param pskb return parameter for expanded skb
+ * @param skb skb
+ * @param head_n required headroom
+ * @param tail_n required tailroom
+ * @return 0 on success, error code otherwise
+ */
+int skb_make_room(struct sk_buff **pskb, struct sk_buff *skb, int head_n, int tail_n){
+    int err = 0;
+    int has_headroom = (head_n <= skb_headroom(skb));
+    int has_tailroom = (tail_n <= skb_tailroom(skb));
+    int writeable = !skb_cloned(skb) && !skb_shared(skb);
+
+    dprintf("> skb=%p headroom=%d head_n=%d tailroom=%d tail_n=%d\n",
+            skb,
+            skb_headroom(skb), head_n,
+            skb_tailroom(skb), tail_n);
+    if(writeable && has_headroom && has_tailroom){
+        // There's room! Reuse it.
+        *pskb = skb;
+    } else {
+        struct sk_buff *new_skb;
+
+        if(writeable && has_tailroom){
+            // Tailroom, no headroom. Expand header the way GRE does.
+            new_skb = skb_realloc_headroom(skb, head_n + 16);
+        } else {
+            // No room. Expand. There may be more efficient ways to do
+            // this, but this is simple and correct.
+            new_skb = skb_copy_expand(skb, head_n + 16, tail_n, GFP_ATOMIC);
+        }
+        if(!new_skb){
+            err = -ENOMEM;
+            goto exit;
+        }
+        dev_kfree_skb(skb);
+        *pskb = new_skb;
+    }
+    dprintf("> skb=%p headroom=%d head_n=%d tailroom=%d tail_n=%d\n",
+            *pskb,
+            skb_headroom(*pskb), head_n,
+            skb_tailroom(*pskb), tail_n);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Copy some data bits from a kernel buffer to an skb.
+ * Derived in the obvious way from skb_copy_bits().
+ *
+ * Overwrites len bytes of existing skb data starting at byte offset.
+ * The skb is not grown. The data is written in turn to the linear
+ * area, the page fragments and the skbs on the frag list.
+ *
+ * @param skb skb to write into
+ * @param offset byte offset into the skb data to start writing at
+ * @param src buffer to copy from
+ * @param len number of bytes to copy
+ * @return 0 on success, -EFAULT if the range does not fit in the skb
+ */
+int skb_put_bits(const struct sk_buff *skb, int offset, void *src, int len)
+{
+ int i, copy;
+ /* Number of bytes in the linear area (total length minus paged data). */
+ int start = skb->len - skb->data_len;
+
+ /* Reject ranges that run past the end of the skb. */
+ if (offset > (int)skb->len-len)
+ goto fault;
+
+ /* Copy header. */
+ if ((copy = start-offset) > 0) {
+ if (copy > len)
+ copy = len;
+ memcpy(skb->data + offset, src, copy);
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ src += copy;
+ }
+
+ /* Copy into the page fragments. 'start' and 'end' are the skb data
+ * offsets at which the current fragment begins and ends. */
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ int end;
+
+ BUG_TRAP(start <= offset+len);
+
+ end = start + skb_shinfo(skb)->frags[i].size;
+ if ((copy = end-offset) > 0) {
+ u8 *vaddr;
+
+ if (copy > len)
+ copy = len;
+
+ /* Map the fragment's page for the duration of the copy. */
+ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
+ memcpy(vaddr + skb_shinfo(skb)->frags[i].page_offset + offset - start,
+ src,
+ copy);
+ kunmap_skb_frag(vaddr);
+
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ src += copy;
+ }
+ start = end;
+ }
+
+ /* Copy into the skbs on the frag list, recursing into each one with
+ * an offset relative to that skb. */
+ if (skb_shinfo(skb)->frag_list) {
+ struct sk_buff *list;
+
+ for (list = skb_shinfo(skb)->frag_list; list; list=list->next) {
+ int end;
+
+ BUG_TRAP(start <= offset+len);
+
+ end = start + list->len;
+ if ((copy = end-offset) > 0) {
+ if (copy > len)
+ copy = len;
+ if (skb_put_bits(list, offset-start, src, copy))
+ goto fault;
+ if ((len -= copy) == 0)
+ return 0;
+ offset += copy;
+ src += copy;
+ }
+ start = end;
+ }
+ }
+ if (len == 0)
+ return 0;
+
+ fault:
+ return -EFAULT;
+}
+
+/** Add some space to the end of a (possibly fragmented) skb.
+ *
+ * Only works with Xen output skbs. Output skbs have 1 frag, and we
+ * add another frag for the extra space.
+ *
+ * The extra space is a kmalloc'ed buffer attached as a new page
+ * fragment; len and data_len are increased by n.
+ * NOTE(review): nothing here records that the fragment memory came from
+ * kmalloc - confirm how it is released when the skb is freed.
+ *
+ * @param skb skb
+ * @param n number of bytes to add
+ * @return 0 on success, -ENOMEM if there is no free fragment slot or
+ * the allocation fails
+ *
+ * @todo fixme
+ */
+int pskb_put(struct sk_buff *skb, int n){
+ int err = 0;
+ // The leading '1 ||' forces the fragment path for every skb, so the
+ // __skb_put() branch below is never taken.
+ if(1 || skb_is_nonlinear(skb)){
+ struct skb_shared_info *info = skb_shinfo(skb);
+ char *ptr = NULL;
+
+ if(info->nr_frags >= MAX_SKB_FRAGS){
+ err = -ENOMEM;
+ goto exit;
+ }
+ ptr = kmalloc(n, GFP_ATOMIC);
+ if(!ptr){
+ err = -ENOMEM;
+ goto exit;
+ }
+ // Describe the new buffer as the last fragment: its page, the
+ // offset of the buffer within that page, and its size.
+ info->nr_frags++;
+ info->frags[info->nr_frags - 1].page = virt_to_page(ptr);
+ info->frags[info->nr_frags - 1].page_offset = ((unsigned long)ptr & ~PAGE_MASK);
+ info->frags[info->nr_frags - 1].size = n;
+
+ skb->data_len += n;
+ skb->len += n;
+ } else {
+ __skb_put(skb, n);
+ }
+ exit:
+ if(err) dprintf("< err=%d\n", err);
+ return err;
+}
+
+/** Dump part of an skb's data in hex, 16 bytes per line.
+ * Each line starts with the byte offset. Prints nothing unless
+ * DEBUG_SKB is non-zero.
+ *
+ * @param skb to print
+ * @param offset byte offset to start printing at
+ * @param n number of bytes to print
+ */
+void skb_print_bits(struct sk_buff *skb, int offset, int n){
+    int chunk = 16;
+    u8 buff[chunk];
+    int i, k;
+
+    if(!DEBUG_SKB) return;
+    for( ; n; n -= k, offset += k){
+        // Take up to one line's worth of bytes out of the skb.
+        k = (n > chunk ? chunk : n);
+        skb_copy_bits(skb, offset, buff, k);
+        printk("%03d ", offset);
+        for(i = 0; i < k; i++){
+            if(i == 8){
+                printk(" ");
+            }
+            printk(":%02x", buff[i] & 0xff);
+        }
+        printk(" \n");
+    }
+}
+
+/** Dump a buffer in hex, 16 bytes per line.
+ * Each line starts with the index of its first byte, with a gap after
+ * the eighth byte. The byte count is printed at the end.
+ *
+ * @param buf to print
+ * @param n number of bytes to print
+ */
+void buf_print(char *buf, int n){
+    int i;
+    for(i = 0; i < n; i++){
+        switch(i % 16){
+        case 0:
+            printk("\n%04d ", i);
+            break;
+        case 8:
+            printk(" ");
+            break;
+        default:
+            break;
+        }
+        printk(":%02x", buf[i] & 0xff);
+    }
+    printk(" %04d\n", n);
+}
+
+/** Cut n bytes off the tail of an skb.
+ * Only the tail pointer and len are adjusted; data_len and any
+ * fragments are not touched.
+ *
+ * @todo fixme: Do we need to handle frags?
+ *
+ * @param skb skb
+ * @param n number of bytes to remove
+ * @return the new tail pointer
+ */
+void *skb_trim_tail(struct sk_buff *skb, int n){
+    skb->len -= n;
+    skb->tail -= n;
+    return skb->tail;
+}
+
+// #define BUG_TRAP(x)
+// if(!(x)){ printk("KERNEL: assertion (" #x ") failed at " __FILE__ "(%d)\n", __LINE__); }
+
+/** Convert a (possibly fragmented) skb into a scatter list.
+ *
+ * One scatterlist entry is produced for the part of the requested range
+ * that lies in the linear area, and one for each page fragment the range
+ * touches. Skbs on the frag list are not visited.
+ *
+ * If any of the requested bytes are left uncovered at exit (including
+ * when sg is too small) the function hits BUG() before returning.
+ *
+ * @param skb skb to convert
+ * @param sg scatterlist to set up
+ * @param sg_n size of sg on input, number of elements set on output
+ * @param offset offset into data to start at
+ * @param len number of bytes
+ * @return 0 on success, error code otherwise
+ */
+int skb_scatterlist(struct sk_buff *skb, struct scatterlist *sg, int *sg_n,
+ int offset, int len){
+ int err = 0;
+ int start; // No. of bytes copied so far (where next copy starts).
+ int size; // Size of the next chunk.
+ int end; // Where the next chunk ends (start + size).
+ int copy; // Number of bytes to copy in one operation.
+ int sg_i = 0; // Index into sg.
+ int i;
+
+ if(DEBUG_SCATTERLIST){
+ dprintf("> offset=%d len=%d (end=%d), skb len=%d,\n",
+ offset, len, offset+len, skb->len);
+ }
+ // First chunk: the linear area of the skb.
+ start = 0;
+ size = skb_headlen(skb);
+ end = start + size;
+ copy = end - offset;
+ if(copy > 0){
+ char *p;
+ if(copy > len) copy = len;
+ if(sg_i >= *sg_n){
+ err = -EINVAL;
+ goto exit;
+ }
+ // Describe the linear data by its page and offset within the page.
+ p = skb->data + offset;
+ SET_SCATTER_ADDR(sg[sg_i], NULL);
+ sg[sg_i].page = virt_to_page(p);
+ sg[sg_i].offset = ((unsigned long)p & ~PAGE_MASK);
+ sg[sg_i].length = copy;
+ if(DEBUG_SCATTERLIST){
+ dprintf("> sg_i=%d .page=%p .offset=%u .length=%d\n",
+ sg_i, sg[sg_i].page, sg[sg_i].offset, sg[sg_i].length);
+ }
+ sg_i++;
+ if((len -= copy) == 0) goto exit;
+ offset += copy;
+ }
+ start = end;
+ // Remaining chunks: one per page fragment overlapping the range.
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++){
+ BUG_TRAP(start <= offset + len);
+ size = skb_shinfo(skb)->frags[i].size;
+ end = start + size;
+ copy = end - offset;
+ if(copy > 0){
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if(copy > len) copy = len;
+ if(sg_i >= *sg_n){
+ err = -EINVAL;
+ goto exit;
+ }
+ SET_SCATTER_ADDR(sg[sg_i], NULL);
+ sg[sg_i].page = frag->page;
+ sg[sg_i].offset = frag->page_offset + offset - start;
+ sg[sg_i].length = copy;
+ if(DEBUG_SCATTERLIST){
+ dprintf("> sg_i=%d .page=%p .offset=%u .length=%d\n",
+ sg_i, sg[sg_i].page, sg[sg_i].offset, sg[sg_i].length);
+ }
+ sg_i++;
+ if((len -= copy) == 0) goto exit;
+ offset += copy;
+ }
+ start = end;
+ }
+ exit:
+ // Report the number of entries used only on success.
+ if(!err) *sg_n = sg_i;
+ // Any bytes still unaccounted for are treated as a fatal error.
+ if(len) wprintf("> len=%d\n", len);
+ if(len) BUG();
+ if(err) dprintf("< err=%d sg_n=%d\n", err, *sg_n);
+ return err;
+}
+
+/** ARP header layout including the ethernet/IPv4 address fields.
+ * Used by print_skb_data() to decode ARP packets; the address block
+ * is always compiled in here (#if 1).
+ */
+struct arpheader
+{
+ unsigned short ar_hrd; /* format of hardware address */
+ unsigned short ar_pro; /* format of protocol address */
+ unsigned char ar_hln; /* length of hardware address */
+ unsigned char ar_pln; /* length of protocol address */
+ unsigned short ar_op; /* ARP opcode (command) */
+
+#if 1
+ /*
+ * Ethernet looks like this : This bit is variable sized however...
+ */
+ unsigned char ar_sha[ETH_ALEN]; /* sender hardware address */
+ unsigned char ar_sip[4]; /* sender IP address */
+ unsigned char ar_tha[ETH_ALEN]; /* target hardware address */
+ unsigned char ar_tip[4]; /* target IP address */
+#endif
+
+};
+
+/** Print a summary of the headers found in a block of packet data.
+ * Debugging aid: decodes the ethernet header, then ARP or IP (with
+ * TCP/UDP ports). Each header is checked against data + len before it
+ * is read; on a failed check a diagnostic is printed instead.
+ *
+ * @param msg prefix for each line (the function name is used if null)
+ * @param count packet number to print (an internal counter is used if zero)
+ * @param skb skb the data belongs to (may be null; only pkt_type and
+ *        protocol are read from it)
+ * @param data start of the ethernet header
+ * @param len number of bytes at data
+ */
+void print_skb_data(char *msg, int count, struct sk_buff *skb, u8 *data, int len)
+{
+    static int skb_count = 1000000;
+    u8 *ptr, *end;
+    u32 src_addr, dst_addr;
+    // Transport layer header.
+    union {
+        struct tcphdr *th;
+        struct udphdr *uh;
+        unsigned char *raw;
+    } h;
+    // Network layer header.
+    union {
+        struct iphdr *iph;
+        struct arpheader *arph;
+        unsigned char *raw;
+    } nh;
+    // Link layer header.
+    union {
+        struct ethhdr *ethernet;
+        unsigned char *raw;
+    } mac;
+    int protocol;
+    if(!count) count = ++skb_count;
+    if(!msg) msg = (char *)__FUNCTION__;
+    if(!data){
+        printk("%s.%d> null data\n", msg, count);
+        return;
+    }
+    ptr = data;
+    end = data + len;
+    mac.raw = ptr;
+    ptr += sizeof(struct ethhdr);
+    if(ptr > end){ printk("***MAC:"); goto exit; }
+    protocol = ntohs(mac.ethernet->h_proto);
+    nh.raw = ptr;
+
+    // The skb is only used for this line, so tolerate a null skb.
+    if(skb){
+        printk("%s.%d> type=%d protocol=0x%x\n",
+               msg, count, skb->pkt_type, ntohs(skb->protocol));
+    }
+    printk("%s.%d> %p mac src=" MACFMT " dst=" MACFMT "\n",
+           msg, count, data,
+           MAC6TUPLE(mac.ethernet->h_source),
+           MAC6TUPLE(mac.ethernet->h_dest));
+
+    switch(protocol){
+    case ETH_P_ARP:
+        ptr += sizeof(struct arpheader);
+        if(ptr > end){ printk("***ARP:"); goto exit; }
+        // The addresses are byte arrays, so copy them out before swapping.
+        memcpy(&src_addr, nh.arph->ar_sip, 4);
+        src_addr = ntohl(src_addr);
+        memcpy(&dst_addr, nh.arph->ar_tip, 4);
+        dst_addr = ntohl(dst_addr);
+        printk("%s.%d> ARP HW src=" MACFMT " dst=" MACFMT "\n",
+               msg, count, MAC6TUPLE(nh.arph->ar_sha), MAC6TUPLE(nh.arph->ar_tha));
+        printk("%s.%d> ARP IP src=" IPFMT " dst=" IPFMT "\n",
+               msg, count, HIPQUAD(src_addr), HIPQUAD(dst_addr));
+        break;
+    case ETH_P_IP: {
+        u16 src_port, dst_port;
+        int ip_hlen;
+        if(ptr + sizeof(struct iphdr) > end){ printk("***IP:"); goto exit; }
+        src_addr = ntohl(nh.iph->saddr);
+        dst_addr = ntohl(nh.iph->daddr);
+        printk("%s.%d> IP proto=%d src=" IPFMT " dst=" IPFMT "\n",
+               msg, count, nh.iph->protocol,
+               HIPQUAD(src_addr), HIPQUAD(dst_addr));
+        // tot_len is in network order in the header.
+        printk("%s.%d> IP tot_len=%u len=%d\n",
+               msg, count, ntohs(nh.iph->tot_len), len - ETH_HLEN);
+        // ihl counts 32-bit words; anything shorter than the fixed header
+        // is malformed and would put the transport header inside the IP header.
+        ip_hlen = nh.iph->ihl * 4;
+        if(ip_hlen < (int)sizeof(struct iphdr)){ printk("***IP: ihl"); goto exit; }
+        ptr += ip_hlen;
+        if(ptr > end){ printk ("***IP: len"); goto exit; }
+        h.raw = ptr;
+        switch(nh.iph->protocol){
+        case IPPROTO_TCP:
+            ptr += sizeof(struct tcphdr);
+            if(ptr > end){ printk("***TCP:"); goto exit; }
+            src_port = ntohs(h.th->source);
+            dst_port = ntohs(h.th->dest);
+            printk("%s.%d> TCP src=" IPFMT ":%u dst=" IPFMT ":%u\n",
+                   msg, count,
+                   HIPQUAD(src_addr), src_port,
+                   HIPQUAD(dst_addr), dst_port);
+            break;
+        case IPPROTO_UDP:
+            ptr += sizeof(struct udphdr);
+            if(ptr > end){ printk("***UDP:"); goto exit; }
+            src_port = ntohs(h.uh->source);
+            dst_port = ntohs(h.uh->dest);
+            printk("%s.%d> UDP src=" IPFMT ":%u dst=" IPFMT ":%u\n",
+                   msg, count,
+                   HIPQUAD(src_addr), src_port,
+                   HIPQUAD(dst_addr), dst_port);
+            break;
+        default:
+            printk("%s.%d> IP %d src=" IPFMT " dst=" IPFMT "\n",
+                   msg, count,
+                   nh.iph->protocol, HIPQUAD(src_addr), HIPQUAD(dst_addr));
+            break;
+        }
+        break; }
+    case ETH_P_IPV6:
+        printk("%s.%d> IPv6\n", msg, count);
+        break;
+    case ETH_P_IPX:
+        printk("%s.%d> IPX\n", msg, count);
+        break;
+    default:
+        printk("%s.%d> protocol=%d\n", msg, count, protocol);
+        break;
+    }
+    return;
+  exit:
+    printk("%s.%d> %s: skb problem\n", msg, count, __FUNCTION__);
+    // Pointer differences and sizeof are not int: cast to match %d.
+    // The ARP size reported is that of the struct actually checked above.
+    printk("%s.%d> %s: data=%p end=%p(%d) ptr=%p(%d) eth=%d arp=%d ip=%d\n",
+           msg, count, __FUNCTION__,
+           data, end, (int)(end - data), ptr, (int)(ptr - data),
+           (int)sizeof(struct ethhdr), (int)sizeof(struct arpheader),
+           (int)sizeof(struct iphdr));
+    return;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_SKB_UTIL_H_
+#define _VNET_SKB_UTIL_H_
+
+/* Forward declarations: only pointers to these are used below. */
+struct sk_buff;
+struct scatterlist;
+
+/* NOTE(review): skb_make_room, skb_put_bits and pskb_put are defined in
+ * the .c file; see their definitions there for the exact contracts.
+ */
+extern int skb_make_room(struct sk_buff **pskb, struct sk_buff *skb, int head_n, int tail_n);
+
+extern int skb_put_bits(const struct sk_buff *skb, int offset, void *src, int len);
+
+extern int pskb_put(struct sk_buff *skb, int n);
+
+/* Print n bytes of skb data starting at offset, as hex (debug only). */
+extern void skb_print_bits(struct sk_buff *skb, int offset, int n);
+
+/* Print n bytes of buf as hex. */
+extern void buf_print(char *buf, int n);
+
+/* Remove n bytes from the tail of skb; returns the new tail. */
+extern void *skb_trim_tail(struct sk_buff *skb, int n);
+
+/* Describe len bytes of skb data at offset as a scatterlist.
+ * *sg_n is the size of sg on input and the number of entries set on output.
+ */
+extern int skb_scatterlist(struct sk_buff *skb, struct scatterlist *sg,
+ int *sg_n, int offset, int len);
+
+/* Print a summary of the ethernet/ARP/IP headers found in data. */
+extern void print_skb_data(char *msg, int count, struct sk_buff *skb, u8 *data, int len);
+
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+
+#include <tunnel.h>
+#include <vnet.h>
+#include <varp.h>
+#include "hash_table.h"
+
+#define MODULE_NAME "VNET"
+//#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** Print a tunnel followed by the chain of tunnels it is layered on.
+ * A null tunnel is printed as a placeholder entry of type "ip".
+ *
+ * @param tunnel tunnel to print (may be null)
+ */
+void Tunnel_print(Tunnel *tunnel){
+    Tunnel *t;
+    if(!tunnel){
+        printk("Tunnel<%p base=%p ref=%02d type=%s>\n",
+               NULL, NULL, 0, "ip");
+        return;
+    }
+    // Walk down the base chain, one line per tunnel.
+    for(t = tunnel; t; t = t->base){
+        printk("Tunnel<%p base=%p ref=%02d type=%s>\n",
+               t,
+               t->base,
+               atomic_read(&t->refcount),
+               t->type->name);
+    }
+}
+
+/** Create a tunnel and call its type's open function.
+ * The new tunnel starts with a reference count of 1 (owned by the caller)
+ * and takes its own reference on base. It is not added to the tunnel table.
+ * If open fails the tunnel is released with Tunnel_decref().
+ *
+ * @param type tunnel type (must have open, send and close)
+ * @param vnet vnet id for the tunnel key
+ * @param addr address for the tunnel key
+ * @param base underlying tunnel (may be null)
+ * @param val return parameter for the tunnel (null on error)
+ * @return 0 on success, -EINVAL for a bad type, -ENOMEM if out of memory,
+ *         or the error from the type's open function
+ */
+int Tunnel_create(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **val){
+    int err = 0;
+    Tunnel *tunnel = NULL;
+    // type has not been validated yet, so it must not be dereferenced
+    // unconditionally in the trace.
+    dprintf("> type=%s vnet=%d addr=" IPFMT " base=%s\n",
+            (type ? type->name : "null"), vnet, NIPQUAD(addr),
+            (base ? base->type->name : "ip"));
+    if(!type || !type->open || !type->send || !type->close){
+        err = -EINVAL;
+        goto exit;
+    }
+    tunnel = kmalloc(sizeof(Tunnel), GFP_ATOMIC);
+    if(!tunnel){
+        err = -ENOMEM;
+        goto exit;
+    }
+    atomic_set(&tunnel->refcount, 1);
+    tunnel->key.vnet = vnet;
+    tunnel->key.addr = addr;
+    tunnel->type = type;
+    tunnel->data = NULL;
+    tunnel->send_stats = (TunnelStats){};
+    Tunnel_incref(base);
+    tunnel->base = base;
+    err = type->open(tunnel);
+  exit:
+    if(err && tunnel){
+        Tunnel_decref(tunnel);
+        tunnel = NULL;
+    }
+    *val = tunnel;
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Create a tunnel and add it to the tunnel table.
+ * On any failure the tunnel is released and *tunnel is set to null.
+ *
+ * @param type tunnel type
+ * @param vnet vnet id
+ * @param addr address
+ * @param base underlying tunnel (may be null)
+ * @param tunnel return parameter for the tunnel
+ * @return 0 on success, error code otherwise
+ */
+int Tunnel_open(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnel){
+    int err;
+
+    dprintf(">\n");
+    err = Tunnel_create(type, vnet, addr, base, tunnel);
+    if(!err){
+        err = Tunnel_add(*tunnel);
+    }
+    if(err){
+        Tunnel_decref(*tunnel);
+        *tunnel = NULL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Account for one packet in a set of tunnel statistics.
+ * The packet is counted as dropped if err is nonzero, as sent otherwise.
+ *
+ * @param stats statistics to update
+ * @param len packet length in bytes
+ * @param err result of sending the packet
+ */
+void TunnelStats_update(TunnelStats *stats, int len, int err){
+    dprintf(">len=%d err=%d\n", len, err);
+    if(!err){
+        stats->packets++;
+        stats->bytes += len;
+    } else {
+        stats->dropped_packets++;
+        stats->dropped_bytes += len;
+    }
+    dprintf("<\n");
+}
+
+/** Table of tunnels, indexed by vnet and addr. */
+HashTable *tunnel_table = NULL;
+
+/** Hash function for the tunnel table: hashes the vnet and addr of the key.
+ *
+ * @param k key (TunnelKey)
+ * @return hashcode
+ */
+static inline Hashcode tunnel_table_key_hash_fn(void *k){
+    TunnelKey *tkey = k;
+    return hash_2ul(tkey->vnet, tkey->addr);
+}
+
+/** Key equality function for the tunnel table.
+ *
+ * @param k1 key to compare (TunnelKey)
+ * @param k2 key to compare (TunnelKey)
+ * @return 1 if vnet and addr both match, 0 otherwise
+ */
+static int tunnel_table_key_equal_fn(void *k1, void *k2){
+    TunnelKey *a = k1;
+    TunnelKey *b = k2;
+    if(a->vnet != b->vnet) return 0;
+    return (a->addr == b->addr);
+}
+
+/** Entry free function for the tunnel table.
+ * Drops the table's reference on the tunnel, then frees the entry.
+ *
+ * @param table containing table
+ * @param entry entry to free (may be null)
+ */
+static void tunnel_table_entry_free_fn(HashTable *table, HTEntry *entry){
+    if(entry){
+        Tunnel *held = entry->value;
+        Tunnel_decref(held);
+        HTEntry_free(entry);
+    }
+}
+
+/** Create the tunnel table and install its hash, equality and free functions.
+ *
+ * @return 0 on success, -ENOMEM if the table could not be allocated
+ */
+int Tunnel_init(void){
+    int err = 0;
+    dprintf(">\n");
+    tunnel_table = HashTable_new(0);
+    if(tunnel_table){
+        tunnel_table->key_equal_fn = tunnel_table_key_equal_fn;
+        tunnel_table->key_hash_fn = tunnel_table_key_hash_fn;
+        tunnel_table->entry_free_fn = tunnel_table_entry_free_fn;
+    } else {
+        err = -ENOMEM;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Find the tunnel for a vnet and destination in the tunnel table.
+ * A tunnel that is found has its reference count incremented, so the
+ * caller must release it with Tunnel_decref().
+ *
+ * @param vnet vnet
+ * @param addr destination address
+ * @return tunnel state or NULL
+ */
+Tunnel * Tunnel_lookup(u32 vnet, u32 addr){
+    TunnelKey key;
+    Tunnel *found;
+    key.vnet = vnet;
+    key.addr = addr;
+    dprintf(">\n");
+    found = HashTable_get(tunnel_table, &key);
+    Tunnel_incref(found);
+    dprintf("< tunnel=%p\n", found);
+    return found;
+}
+
+/** Add a tunnel to the tunnel table.
+ * The tunnel is used as both key and value (its key is its first member).
+ * The table takes its own reference on success.
+ *
+ * @param tunnel tunnel to add
+ * @return 0 on success, -ENOMEM if the entry could not be added
+ */
+int Tunnel_add(Tunnel *tunnel){
+    int err = 0;
+    dprintf(">\n");
+    if(!HashTable_add(tunnel_table, tunnel, tunnel)){
+        err = -ENOMEM;
+    } else {
+        Tunnel_incref(tunnel);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Remove a tunnel from the tunnel table.
+ *
+ * @param tunnel tunnel to remove (used as the key)
+ * @return result of HashTable_remove()
+ */
+int Tunnel_del(Tunnel *tunnel){
+    int removed = HashTable_remove(tunnel_table, tunnel);
+    return removed;
+}
+
+/** Do tunnel send processing on a packet.
+ * With a tunnel, the packet is handed to the tunnel type's send function
+ * and the tunnel's send statistics are updated. Without one, the packet
+ * is transmitted directly on the default device (DEVICE).
+ *
+ * @param tunnel tunnel state (may be null)
+ * @param skb packet
+ * @return 0 on success, error code otherwise
+ */
+int Tunnel_send(Tunnel *tunnel, struct sk_buff *skb){
+    int err = 0;
+    int len;
+    dprintf("> tunnel=%p skb=%p\n", tunnel, skb);
+    // Take the length now: skb may have been freed once it has been sent.
+    len = skb->len;
+    if(!tunnel){
+        struct net_device *dev = NULL;
+        err = vnet_get_device(DEVICE, &dev);
+        if(!err){
+            skb->dev = dev;
+            err = skb_xmit(skb);
+            dev_put(dev);
+        }
+    } else {
+        dprintf("> type=%s type->send...\n", tunnel->type->name);
+        err = tunnel->type->send(tunnel, skb);
+        TunnelStats_update(&tunnel->send_stats, len, err);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Module initialisation for tunnels: sets up the tunnel table.
+ *
+ * @return 0 on success, error code otherwise
+ */
+int __init tunnel_module_init(void){
+    int err = Tunnel_init();
+    return err;
+}
+
+/** Module exit for tunnels: release the table created by Tunnel_init().
+ * Without this the table (and the references it holds on tunnels) would
+ * be leaked when the module is unloaded.
+ * NOTE(review): assumes HashTable_free() runs entry_free_fn on each
+ * entry, as its use for the varp table suggests - confirm.
+ */
+void __exit tunnel_module_exit(void){
+    if(tunnel_table){
+        HashTable_free(tunnel_table);
+        tunnel_table = NULL;
+    }
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_TUNNEL_H__
+#define __VNET_TUNNEL_H__
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <asm/atomic.h>
+
+struct sk_buff;
+struct Tunnel;
+
+/** Operations implementing one kind of tunnel.
+ * Tunnel_create() rejects a type that lacks any of open, send or close.
+ */
+typedef struct TunnelType {
+ /** Type name (used in debug output). */
+ const char *name;
+ /** Called by Tunnel_create() once the tunnel fields are set up. */
+ int (*open)(struct Tunnel *tunnel);
+ /** Called by Tunnel_send() to send a packet through the tunnel. */
+ int (*send)(struct Tunnel *tunnel, struct sk_buff *skb);
+ /** Called by Tunnel_decref() when the last reference is dropped. */
+ void (*close)(struct Tunnel *tunnel);
+} TunnelType;
+
+/** Packet counters for a tunnel, maintained by TunnelStats_update(). */
+typedef struct TunnelStats {
+ /** Bytes in packets sent without error. */
+ int bytes;
+ /** Packets sent without error. */
+ int packets;
+ /** Bytes in packets whose send returned an error. */
+ int dropped_bytes;
+ /** Packets whose send returned an error. */
+ int dropped_packets;
+} TunnelStats;
+
+/** Key identifying a tunnel in the tunnel table. */
+typedef struct TunnelKey {
+ /** Vnet id. */
+ u32 vnet;
+ /** Destination address. */
+ u32 addr;
+} TunnelKey;
+
+/** State for one tunnel. Reference counted: use Tunnel_incref() and
+ * Tunnel_decref().
+ */
+typedef struct Tunnel {
+ /** Key identifying the tunnel. Must be first: the tunnel table is
+ * given the tunnel pointer itself as the key. */
+ struct TunnelKey key;
+ /** Reference count. */
+ atomic_t refcount;
+ /** Tunnel type. */
+ struct TunnelType *type;
+ /** Statistics. */
+ struct TunnelStats send_stats;
+ /** Type-dependent state. */
+ void *data;
+ /** Underlying tunnel (may be null). */
+ struct Tunnel *base;
+} Tunnel;
+
+/* Declared here because Tunnel_decref() below calls it. */
+extern void Tunnel_print(Tunnel *tunnel);
+
+/** Drop a reference to a tunnel.
+ * When the last reference goes the tunnel is printed, closed via its
+ * type, its reference on the base tunnel is dropped and it is freed.
+ *
+ * @param tunnel tunnel (may be null)
+ */
+static inline void Tunnel_decref(Tunnel *tunnel){
+    if(tunnel && atomic_dec_and_test(&tunnel->refcount)){
+        printk("%s> Closing tunnel:\n", __FUNCTION__);
+        Tunnel_print(tunnel);
+        tunnel->type->close(tunnel);
+        Tunnel_decref(tunnel->base);
+        kfree(tunnel);
+    }
+}
+
+/** Take a reference to a tunnel.
+ *
+ * @param tunnel tunnel (may be null, in which case nothing happens)
+ */
+static inline void Tunnel_incref(Tunnel *tunnel){
+    if(tunnel){
+        atomic_inc(&tunnel->refcount);
+    }
+}
+
+/* Tunnel table: create it, look up (takes a reference), add and remove. */
+extern int Tunnel_init(void);
+extern Tunnel * Tunnel_lookup(u32 vnet, u32 addr);
+extern int Tunnel_add(Tunnel *tunnel);
+extern int Tunnel_del(Tunnel *tunnel);
+/* Send a packet through a tunnel (or directly if tunnel is null). */
+extern int Tunnel_send(Tunnel *tunnel, struct sk_buff *skb);
+
+/* Tunnel_create makes a tunnel; Tunnel_open also adds it to the table. */
+extern int Tunnel_create(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnelp);
+extern int Tunnel_open(TunnelType *type, u32 vnet, u32 addr, Tunnel *base, Tunnel **tunnelp);
+
+/* Module init/exit hooks for the tunnel code. */
+extern int tunnel_module_init(void);
+extern void tunnel_module_exit(void);
+
+#endif /* !__VNET_TUNNEL_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/version.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/udp.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/semaphore.h>
+
+#include <tunnel.h>
+#include <vnet.h>
+#include <vif.h>
+#include <varp.h>
+#include <if_varp.h>
+
+#include "allocate.h"
+#include "hash_table.h"
+#include "sys_net.h"
+#include "sys_string.h"
+
+#define MODULE_NAME "VARP"
+//#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+
+/* MAC_ETH(skb): the ethernet header of an skb, as a struct ethhdr pointer,
+ * on both 2.4 and 2.6 kernels.
+ */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+// The 'ethernet' field in the skb->mac union went away.
+#define MAC_ETH(_skb) ((struct ethhdr *)(_skb)->mac.raw)
+#else
+#define MAC_ETH(_skb) ((_skb)->mac.ethernet)
+#endif
+
+/** @file VARP: Virtual ARP.
+ *
+ * Handles virtual ARP requests for vnet/vmac.
+ */
+
+/*
+
+Varp uses UDP on port 1798.
+
+on domain up: ?
+ send varp.announce { id, vmac, vnet, coa } for each vif
+ that hasn't been announced before, or has changed.
+ install vif entries in local table.
+
+on varp.announce{ id, vmac, vnet, coa }:
+ update VARP entry for vmac x vnet if have one, reset ttl.
+
+on varp.request { id, vmac, vnet }:
+ if have a vif for the requested vmac/vnet,
+ reply with varp.announce{ id, vmac, vnet, coa }
+
+on timer:
+ traverse VARP table, flush old entries.
+
+on probe timer:
+ probe again if not out of tries.
+ if out of tries invalidate entry.
+
+*/
+
+/** Time-to-live of varp entries (in jiffies).
+ * Also the interval of the table sweep timer (see VarpTable_schedule). */
+#define VARP_ENTRY_TTL (60*HZ)
+
+/** Maximum number of varp probes to make.
+ * Once reached, the probe timer marks the entry failed. */
+#define VARP_PROBE_MAX 5
+
+/** Interval between varp probes (in jiffies). */
+#define VARP_PROBE_INTERVAL (3*HZ)
+
+/** Maximum number of queued skbs for a varp entry.
+ * Initial value of an entry's queue_max. */
+#define VARP_QUEUE_MAX 16
+
+/** Number of buckets in the varp table (must be prime). */
+#define VARP_TABLE_BUCKETS 3001
+
+/** Varp entry states. */
+enum {
+ /** Initial state of a new entry; the probe timer keeps probing. */
+ VARP_STATE_INCOMPLETE = 1,
+ /** The probe timer does nothing for an entry in this state. */
+ VARP_STATE_REACHABLE = 2,
+ /** Set by the probe timer when the entry has run out of probes. */
+ VARP_STATE_FAILED = 3
+};
+
+/** Varp entry flags. */
+enum {
+ /** Set by the probe timer while it is still probing for the entry. */
+ VARP_FLAG_PROBING = 1,
+ /** NOTE(review): presumably exempts an entry from sweeping - not used
+ * in the code visible here, confirm. */
+ VARP_FLAG_PERMANENT = 2,
+};
+
+/** Key for varp entries.
+ * Hashed by varp_key_hash_fn() and compared by varp_key_equal_fn().
+ */
+typedef struct VarpKey {
+ /** Vnet id (host order). */
+ u32 vnet;
+ /** Virtual MAC address. */
+ Vmac vmac;
+} VarpKey;
+
+/** An entry in the varp cache.
+ * Reference counted: use VarpEntry_incref() and VarpEntry_decref().
+ */
+typedef struct VarpEntry {
+ /** Key for the entry. */
+ VarpKey key;
+ /** Care-of address for the key. */
+ u32 addr;
+ /** Last-updated timestamp (jiffies). */
+ unsigned long timestamp;
+ /** State (one of the VARP_STATE_ values). */
+ short state;
+ /** Flags (VARP_FLAG_ bits). */
+ short flags;
+ /** Reference count. */
+ atomic_t refcount;
+ /** Lock. Taken with VarpEntry_lock() / VarpEntry_unlock(). */
+ rwlock_t lock;
+ /** How many probes have been made. */
+ atomic_t probes;
+ /** Probe timer. Its handler is varp_timer_fn(). */
+ struct timer_list timer;
+ /** Error handler. Called by VarpEntry_error() with the first queued
+ * skb before the queue is purged. */
+ void (*error)(struct VarpEntry *ventry, struct sk_buff *skb);
+ /** Outbound skb queue. */
+ struct sk_buff_head queue;
+ /** Maximum size of the queue. */
+ int queue_max;
+
+ /* NOTE(review): purpose not evident from the code visible here. */
+ int locks;
+} VarpEntry;
+
+/** The varp cache. Varp entries indexed by VarpKey. */
+typedef struct VarpTable {
+
+ /** Hash table holding the entries (the values are VarpEntry). */
+ HashTable *table;
+
+ /** Sweep timer. Its handler is varp_table_timer_fn(). */
+ struct timer_list timer;
+
+ /** Lock. Need to use a semaphore instead of a spinlock because
+ * some operations under the varp table lock can schedule - and
+ * you mustn't hold a spinlock when scheduling.
+ */
+ struct semaphore lock;
+
+} VarpTable;
+
+/** The varp cache. */
+static VarpTable *varp_table = NULL;
+
+/** Module parameter for the multicast address. */
+static char *varp_mcaddr = NULL;
+
+/** Multicast address (network order).
+ * varp_send() uses it as the destination of unsolicited messages when it
+ * is a multicast address; otherwise the device broadcast address is used. */
+u32 varp_mcast_addr = 0;
+
+/** Unicast address (network order). */
+u32 varp_ucast_addr = 0;
+
+/** UDP port (network order). Used by varp_send() for the destination
+ * port, and for the source port of unsolicited messages. */
+u16 varp_port = 0;
+
+/** Network device to use. Looked up by name in varp_send() when the
+ * caller does not supply a device. */
+char *varp_device = DEVICE;
+
+/* Varp table locking. All four macros take or release the same semaphore,
+ * so readers and writers are not distinguished. The flags argument is only
+ * zeroed; presumably it is kept so call sites match an irqsave-style API.
+ */
+#define VarpTable_read_lock(z, flags) do{ (flags) = 0; down(&(z)->lock); } while(0)
+#define VarpTable_read_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0)
+#define VarpTable_write_lock(z, flags) do{ (flags) = 0; down(&(z)->lock); } while(0)
+#define VarpTable_write_unlock(z, flags) do{ (flags) = 0; up(&(z)->lock); } while(0)
+
+/* Varp entry locking: always takes the entry's rwlock for writing,
+ * with interrupts disabled.
+ */
+#define VarpEntry_lock(ventry, flags) write_lock_irqsave(&(ventry)->lock, (flags))
+#define VarpEntry_unlock(ventry, flags) write_unlock_irqrestore(&(ventry)->lock, (flags))
+
+/* Forward declarations for functions used before their definitions. */
+void VarpTable_sweep(VarpTable *z, int all);
+void VarpTable_print(VarpTable *z);
+
+/** Debug helper: print the varp cache.
+ * Compiles to an empty function unless DEBUG is defined.
+ */
+void varp_dprint(void){
+#if defined(DEBUG)
+    VarpTable_print(varp_table);
+#endif
+}
+
+/** Print the varp configuration (device, addresses, port) followed by
+ * the contents of the varp cache, between two separator lines.
+ */
+void varp_print(void){
+    printk(KERN_INFO "=== VARP ===============================================================\n");
+    printk(KERN_INFO "varp_device %s\n",
+           varp_device);
+    printk(KERN_INFO "varp_mcast_addr " IPFMT "\n",
+           NIPQUAD(varp_mcast_addr));
+    printk(KERN_INFO "varp_ucast_addr " IPFMT "\n",
+           NIPQUAD(varp_ucast_addr));
+    printk(KERN_INFO "varp_port %d\n",
+           ntohs(varp_port));
+    VarpTable_print(varp_table);
+    printk(KERN_INFO "========================================================================\n");
+}
+
+/** Find a network device by name.
+ * Uses dev_get_by_name(), so on success the caller holds a reference
+ * to the device and must release it with dev_put().
+ *
+ * @param name device name
+ * @param dev return parameter for the device (null if not found)
+ * @return 0 on success, -ENETDOWN if there is no such device
+ */
+int vnet_get_device(const char *name, struct net_device **dev){
+    *dev = dev_get_by_name(name);
+    return (*dev ? 0 : -ENETDOWN);
+}
+
+/** Get the source address from a device.
+ * Returns the first address on the device's IPv4 address list.
+ *
+ * @param dev device
+ * @param addr return parameter for address (unchanged on error)
+ * @return 0 on success, -EIO if the device has no IPv4 state,
+ *         -EADDRNOTAVAIL if it has no address configured
+ */
+int vnet_get_device_address(struct net_device *dev, u32 *addr){
+    int err = 0;
+    struct in_device *in_dev;
+
+    in_dev = in_dev_get(dev);
+    if(!in_dev){
+        err = -EIO;
+        goto exit;
+    }
+    // A device with no address configured has an empty (null) ifa_list.
+    if(in_dev->ifa_list){
+        *addr = in_dev->ifa_list->ifa_address;
+    } else {
+        err = -EADDRNOTAVAIL;
+    }
+    in_dev_put(in_dev);
+  exit:
+    return err;
+}
+
+#ifndef LL_RESERVED_SPACE
+/* Fallback for kernels that do not define LL_RESERVED_SPACE: the device's
+ * hard_header_len rounded down to a multiple of HH_DATA_MOD, plus
+ * HH_DATA_MOD. The argument is parenthesised so that any pointer
+ * expression can be passed.
+ */
+#define HH_DATA_MOD 16
+#define LL_RESERVED_SPACE(dev) \
+ ((((dev)->hard_header_len) & ~(HH_DATA_MOD - 1)) + HH_DATA_MOD)
+#endif
+
+/** Send a varp protocol message.
+ * Builds an ethernet/IP/UDP packet carrying a VarpHdr and transmits it.
+ * When replying (skbin given) the packet goes back to the sender of skbin;
+ * otherwise it goes to the varp multicast address, or to the device
+ * broadcast address if that is not a multicast address.
+ *
+ * @param opcode varp opcode (host order)
+ * @param dev device (may be null, in which case varp_device is used)
+ * @param skbin skb being replied to (may be null)
+ * @param vnet vnet id (in host order)
+ * @param vmac vmac (in network order)
+ * @return 0 on success, error code otherwise
+ */
+int varp_send(u16 opcode, struct net_device *dev, struct sk_buff *skbin,
+              u32 vnet, Vmac *vmac){
+    int err = 0;
+    int link_n = 0;
+    int ip_n = sizeof(struct iphdr);
+    int udp_n = sizeof(struct udphdr);
+    int varp_n = sizeof(VarpHdr);
+    struct sk_buff *skbout = NULL;
+    struct in_device *in_dev = NULL;
+    // Device reference taken here (null if the caller supplied dev).
+    struct net_device *dev_held = NULL;
+    VarpHdr *varph = NULL;
+    u8 macbuf[6] = {};
+    u8 *smac, *dmac;
+    u32 saddr, daddr;
+    u16 sport, dport;
+
+    dmac = macbuf;
+    // vnet is in host order here, so print it as it is.
+    dprintf("> opcode=%d vnet=%d vmac=" MACFMT "\n",
+            opcode, vnet, MAC6TUPLE(vmac->mac));
+    if(!dev){
+        //todo: should use routing for daddr to get device.
+        err = vnet_get_device(varp_device, &dev);
+        if(err) goto exit;
+        // vnet_get_device() took a reference which must be dropped on exit.
+        dev_held = dev;
+    }
+    link_n = LL_RESERVED_SPACE(dev);
+    in_dev = in_dev_get(dev);
+    if(!in_dev){
+        // Report the failure instead of returning success with nothing sent.
+        err = -EIO;
+        goto exit;
+    }
+    if(!in_dev->ifa_list){
+        // No address configured on the device: no source address to use.
+        err = -EADDRNOTAVAIL;
+        goto exit;
+    }
+
+    smac = dev->dev_addr;
+    saddr = in_dev->ifa_list->ifa_address;
+
+    if(skbin){
+        // Reply to the sender of skbin.
+        dmac = MAC_ETH(skbin)->h_source;
+        sport = skbin->h.uh->dest;
+        daddr = skbin->nh.iph->saddr;
+        //dport = skbin->h.uh->source;
+        dport = varp_port;
+    } else {
+        if(MULTICAST(varp_mcast_addr)){
+            daddr = varp_mcast_addr;
+            ip_eth_mc_map(daddr, dmac);
+        } else {
+            daddr = in_dev->ifa_list->ifa_broadcast;
+            dmac = dev->broadcast;
+        }
+        sport = varp_port;
+        dport = varp_port;
+    }
+    in_dev_put(in_dev);
+    in_dev = NULL;
+
+    dprintf("> smac=" MACFMT " dmac=" MACFMT "\n", MAC6TUPLE(smac), MAC6TUPLE(dmac));
+    dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n", NIPQUAD(saddr), NIPQUAD(daddr));
+    dprintf("> sport=%u dport=%u\n", ntohs(sport), ntohs(dport));
+
+    skbout = alloc_skb(link_n + ip_n + udp_n + varp_n, GFP_ATOMIC);
+    if (!skbout){
+        err = -ENOMEM;
+        goto exit;
+    }
+    skbout->dev = dev;
+    skb_reserve(skbout, link_n);
+    skbout->protocol = htons(ETH_P_IP);
+
+    // Device header. Pushes device header on front of skb.
+    if (dev->hard_header){
+        err = dev->hard_header(skbout, dev, ETH_P_IP, dmac, smac, skbout->len);
+        if(err < 0) goto exit;
+        skbout->mac.raw = skbout->data;
+    }
+
+    // IP header.
+    skbout->nh.raw = skb_put(skbout, ip_n);
+    skbout->nh.iph->version = 4;
+    skbout->nh.iph->ihl = ip_n / 4;
+    skbout->nh.iph->tos = 0;
+    skbout->nh.iph->tot_len = htons(ip_n + udp_n + varp_n);
+    skbout->nh.iph->id = 0;
+    skbout->nh.iph->frag_off = 0;
+    skbout->nh.iph->ttl = 64;
+    skbout->nh.iph->protocol = IPPROTO_UDP;
+    skbout->nh.iph->saddr = saddr;
+    skbout->nh.iph->daddr = daddr;
+    skbout->nh.iph->check = 0;
+
+    // UDP header.
+    skbout->h.raw = skb_put(skbout, udp_n);
+    skbout->h.uh->source = sport;
+    skbout->h.uh->dest = dport;
+    skbout->h.uh->len = htons(udp_n + varp_n);
+    skbout->h.uh->check = 0;
+
+    // Varp header.
+    varph = (void*)skb_put(skbout, varp_n);
+    *varph = (VarpHdr){};
+    varph->id = htons(VARP_ID);
+    varph->opcode = htons(opcode);
+    varph->vnet = htonl(vnet);
+    varph->vmac = *vmac;
+    varph->addr = saddr;
+
+    err = skb_xmit(skbout);
+
+  exit:
+    if(in_dev) in_dev_put(in_dev);
+    if(err && skbout) kfree_skb(skbout);
+    if(dev_held) dev_put(dev_held);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Ask who owns the destination MAC of a packet.
+ * Sends a varp request for the vnet and the ethernet destination
+ * address of skb, on the default varp device.
+ *
+ * @param skb packet
+ * @param vnet vnet (in host order)
+ * @return 0 on success, error code otherwise
+ */
+int varp_solicit(struct sk_buff *skb, int vnet){
+    Vmac *target;
+    int err;
+    dprintf("> skb=%p\n", skb);
+    varp_dprint();
+    target = (Vmac*)MAC_ETH(skb)->h_dest;
+    err = varp_send(VARP_OP_REQUEST, NULL, NULL, vnet, target);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Test some flags.
+ *
+ * @param z varp entry
+ * @param flags to test
+ * @return nonzero if any of the flags are set
+ */
+int VarpEntry_get_flags(VarpEntry *z, int flags){
+    int masked = z->flags & flags;
+    return masked;
+}
+
+/** Turn some flags on or off.
+ *
+ * @param z varp entry
+ * @param flags to change
+ * @param set turn the flags on if nonzero, off if zero
+ * @return new flags value
+ */
+int VarpEntry_set_flags(VarpEntry *z, int flags, int set){
+    z->flags = (set ? (z->flags | flags) : (z->flags & ~flags));
+    return z->flags;
+}
+
+/** Print a one-line description of a varp entry: reference count, state,
+ * probing flag, key, care-of address, queue length and timestamp.
+ *
+ * @param ventry varp entry (may be null)
+ */
+void VarpEntry_print(VarpEntry *ventry){
+    char *state_str;
+    char *probe_str;
+
+    if(!ventry){
+        printk("VENTRY: Null!\n");
+        return;
+    }
+    if(ventry->state == VARP_STATE_INCOMPLETE){
+        state_str = "INC";
+    } else if(ventry->state == VARP_STATE_REACHABLE){
+        state_str = "RCH";
+    } else if(ventry->state == VARP_STATE_FAILED){
+        state_str = "FLD";
+    } else {
+        state_str = "UNK";
+    }
+    probe_str = (VarpEntry_get_flags(ventry, VARP_FLAG_PROBING) ? "P" : " ");
+
+    printk(KERN_INFO "VENTRY(%p ref=%1d %s %s vnet=%d vmac=" MACFMT " addr=" IPFMT " q=%d t=%lu)\n",
+           ventry,
+           atomic_read(&ventry->refcount),
+           state_str, probe_str,
+           ventry->key.vnet,
+           MAC6TUPLE(ventry->key.vmac.mac),
+           NIPQUAD(ventry->addr),
+           skb_queue_len(&ventry->queue),
+           ventry->timestamp);
+}
+
+/** Release the memory of a varp entry.
+ *
+ * @param z varp entry (may be null)
+ */
+void VarpEntry_free(VarpEntry *z){
+    if(z){
+        deallocate(z);
+    }
+}
+
+/** Take a reference to a varp entry.
+ *
+ * @param z varp entry (may be null)
+ */
+void VarpEntry_incref(VarpEntry *z){
+    if(z){
+        atomic_inc(&z->refcount);
+    }
+}
+
+/** Drop a reference to a varp entry, freeing it when none are left.
+ *
+ * @param z varp entry (may be null)
+ */
+void VarpEntry_decref(VarpEntry *z){
+    if(z && atomic_dec_and_test(&z->refcount)){
+        VarpEntry_free(z);
+    }
+}
+
+/** Report failure for the packets queued on a varp entry.
+ * If the queue is not empty, the entry's error handler (if any) is called
+ * with the first queued skb and then the whole queue is purged.
+ *
+ * @param ventry varp entry
+ */
+void VarpEntry_error(VarpEntry *ventry){
+    struct sk_buff *first = skb_peek(&ventry->queue);
+    if(first){
+        if(ventry->error){
+            ventry->error(ventry, first);
+        }
+        skb_queue_purge(&ventry->queue);
+    }
+}
+
+/** Schedule the varp entry timer to fire one probe interval from now.
+ * Must increment the reference count before doing
+ * this the first time, so the ventry won't be freed
+ * before the timer goes off.
+ *
+ * @param ventry varp entry
+ */
+void VarpEntry_schedule(VarpEntry *ventry){
+    ventry->timer.expires = jiffies + VARP_PROBE_INTERVAL;
+    add_timer(&ventry->timer);
+}
+
+/** Function called when a varp entry timer goes off.
+ * If the entry is still incomplete, carries on probing: the timer is
+ * rescheduled and, if packets are queued, another request is sent.
+ * Otherwise stops probing and drops the timer's reference on the entry.
+ * An entry that has used all its probes is marked failed and its queued
+ * packets are discarded via VarpEntry_error().
+ *
+ * @param arg ventry
+ */
+static void varp_timer_fn(unsigned long arg){
+    unsigned long flags;
+    VarpEntry *ventry = (VarpEntry *)arg;
+    struct sk_buff *skb = NULL;
+    int probing = 0, solicit = 0;
+
+    dprintf(">\n"); //VarpEntry_print(ventry);
+    VarpEntry_lock(ventry, flags);
+    if(ventry->state == VARP_STATE_REACHABLE){
+        // Do nothing.
+    } else if(atomic_read(&ventry->probes) < VARP_PROBE_MAX){
+        // Still have tries left: probe again.
+        probing = 1;
+        VarpEntry_schedule(ventry);
+        skb = skb_peek(&ventry->queue);
+        if(skb){
+            dprintf("> skbs in queue - solicit\n");
+            atomic_inc(&ventry->probes);
+            solicit = 1;
+        } else {
+            dprintf("> empty queue.\n");
+        }
+    } else {
+        dprintf("> Out of probes: FAILED\n");
+        VarpEntry_error(ventry);
+        ventry->state = VARP_STATE_FAILED;
+    }
+    // The flags are protected by the entry lock, so update them before
+    // releasing it on every path.
+    VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, probing);
+    VarpEntry_unlock(ventry, flags);
+    // Send the request without holding the entry lock.
+    // NOTE(review): skb is still on the queue and is used here after the
+    // lock has been dropped - confirm nothing can free it concurrently.
+    if(solicit) varp_solicit(skb, ventry->key.vnet);
+    if(!probing) VarpEntry_decref(ventry);
+    dprintf("<\n");
+}
+
+/** Default error function for varp entries.
+ *
+ * @param ventry varp entry
+ * @param skb packet dropped because of error
+ */
+static void varp_error_fn(VarpEntry *ventry, struct sk_buff *skb){
+}
+
+/** Allocate and initialise a varp entry.
+ * The entry starts incomplete with a reference count of 1, an empty
+ * queue, the default error handler and an initialised (unscheduled)
+ * probe timer.
+ *
+ * @param vnet vnet id
+ * @param vmac virtual MAC address (copied)
+ * @return ventry or null
+ */
+VarpEntry * VarpEntry_new(u32 vnet, Vmac *vmac){
+    VarpEntry *ventry = ALLOCATE(VarpEntry);
+    if(!ventry) return ventry;
+
+    // Key.
+    ventry->key.vnet = vnet;
+    ventry->key.vmac = *vmac;
+
+    // State.
+    atomic_set(&ventry->refcount, 1);
+    ventry->lock = RW_LOCK_UNLOCKED;
+    ventry->state = VARP_STATE_INCOMPLETE;
+    ventry->timestamp = jiffies;
+    ventry->error = varp_error_fn;
+
+    // Outbound queue.
+    ventry->queue_max = VARP_QUEUE_MAX;
+    skb_queue_head_init(&ventry->queue);
+
+    // Probe timer.
+    init_timer(&ventry->timer);
+    ventry->timer.data = (unsigned long)ventry;
+    ventry->timer.function = varp_timer_fn;
+
+    return ventry;
+}
+
+/** Hash function for keys in the varp cache.
+ * Hashes the vnet id and mac.
+ *
+ * The mac bytes are widened to unsigned long before shifting: shifting a
+ * byte >= 0x80 left by 24 as an int overflows into the sign bit, which is
+ * undefined and sign-extends when converted to a 64-bit unsigned long.
+ *
+ * @param k key (VarpKey)
+ * @return hashcode
+ */
+Hashcode varp_key_hash_fn(void *k){
+    VarpKey *key = k;
+    unsigned long mac_hi, mac_lo;
+    Hashcode h;
+    // First four bytes of the mac.
+    mac_hi = ((unsigned long)(key->vmac.mac[0] & 0xff) << 24) |
+             ((unsigned long)(key->vmac.mac[1] & 0xff) << 16) |
+             ((unsigned long)(key->vmac.mac[2] & 0xff) <<  8) |
+             ((unsigned long)(key->vmac.mac[3] & 0xff)      );
+    // Last two bytes of the mac.
+    mac_lo = ((unsigned long)(key->vmac.mac[4] & 0xff) <<  8) |
+             ((unsigned long)(key->vmac.mac[5] & 0xff)      );
+    h = hash_2ul(key->vnet, mac_hi);
+    h = hash_hul(h, mac_lo);
+    return h;
+}
+
+/** Key equality function for the varp cache.
+ * Two keys are equal when both the vnet and the mac match.
+ *
+ * @param k1 key to compare (VarpKey)
+ * @param k2 key to compare (VarpKey)
+ * @return 1 if equal, 0 otherwise
+ */
+int varp_key_equal_fn(void *k1, void *k2){
+    VarpKey *a = k1;
+    VarpKey *b = k2;
+    if(a->vnet != b->vnet) return 0;
+    return (memcmp(a->vmac.mac, b->vmac.mac, ETH_ALEN) == 0);
+}
+
+/** Release a hash table entry of the varp cache.
+ * Drops one reference on the varp entry stored as the value (if any)
+ * and then frees the hash table entry itself.
+ *
+ * @param table containing table
+ * @param entry entry to free (may be null)
+ */
+static void varp_entry_free_fn(HashTable *table, HTEntry *entry){
+    if(entry){
+        VarpEntry *ventry = entry->value;
+
+        if(ventry){
+            VarpEntry_decref(ventry);
+        }
+        HTEntry_free(entry);
+    }
+}
+
+/** Free the whole varp cache.
+ * Dangerous: the table is deallocated unconditionally, so the caller must
+ * make sure nothing else can still reach it.
+ *
+ * @param z varp cache (may be null, in which case nothing is done)
+ */
+void VarpTable_free(VarpTable *z){
+ unsigned long flags;
+ if(!z) return;
+ VarpTable_write_lock(z, flags);
+ // Cancel the sweep timer and clear its argument; varp_table_timer_fn()
+ // ignores a null argument.
+ // NOTE(review): del_timer() does not wait for a handler that is already
+ // running on another cpu - confirm that cannot happen here.
+ del_timer(&z->timer);
+ z->timer.data = 0;
+ // Presumably releases every entry through the table's entry_free_fn.
+ if(z->table) HashTable_free(z->table);
+ VarpTable_write_unlock(z, flags);
+ deallocate(z);
+}
+
+/** Arm the varp table timer to fire VARP_ENTRY_TTL jiffies from now.
+ *
+ * @param z varp table
+ */
+void VarpTable_schedule(VarpTable *z){
+    z->timer.expires = jiffies + VARP_ENTRY_TTL;
+    add_timer(&z->timer);
+}
+
+/** Timer callback for the varp table.
+ * Sweeps old varp cache entries and re-arms the timer.
+ * A null argument is ignored (and the timer is not re-armed).
+ *
+ * @param arg varp table
+ */
+static void varp_table_timer_fn(unsigned long arg){
+    VarpTable *table = (VarpTable *)arg;
+
+    if(!table) return;
+    VarpTable_sweep(table, 0);
+    VarpTable_schedule(table);
+}
+
+/** Print a varp table.
+ * Walks the table under its read lock and prints each entry while
+ * holding that entry's lock.
+ *
+ * @param z table
+ */
+void VarpTable_print(VarpTable *z){
+    HashTable_for_decl(entry);
+    VarpEntry *ventry;
+    unsigned long flags, vflags;
+
+    VarpTable_read_lock(z, flags);
+    // Iterate the table that was passed in, not the global varp_table.
+    HashTable_for_each(entry, z->table){
+        ventry = entry->value;
+        VarpEntry_lock(ventry, vflags);
+        VarpEntry_print(ventry);
+        VarpEntry_unlock(ventry, vflags);
+    }
+    VarpTable_read_unlock(z, flags);
+}
+
+/** Create a varp table.
+ * Allocates the table and its hash table, installs the varp key and
+ * entry callbacks and starts the periodic sweep timer.
+ *
+ * @return new table or null
+ */
+VarpTable * VarpTable_new(void){
+    int err = -ENOMEM;
+    VarpTable *z = NULL;
+
+    z = ALLOCATE(VarpTable);
+    if(!z) goto exit;
+    // Set up the lock and timer before anything that can fail: the error
+    // path calls VarpTable_free(), which takes the lock and deletes the timer.
+    init_MUTEX(&z->lock);
+    init_timer(&z->timer);
+    z->timer.data = (unsigned long)z;
+    z->timer.function = varp_table_timer_fn;
+
+    z->table = HashTable_new(VARP_TABLE_BUCKETS);
+    if(!z->table) goto exit;
+    z->table->key_equal_fn = varp_key_equal_fn;
+    z->table->key_hash_fn = varp_key_hash_fn;
+    z->table->entry_free_fn = varp_entry_free_fn;
+    VarpTable_schedule(z);
+    err = 0;
+  exit:
+    if(err){
+        VarpTable_free(z);
+        z = NULL;
+    }
+    return z;
+}
+
+/** Add a new entry to the varp table.
+ * On success the entry is returned with an extra reference taken for
+ * the caller, who must drop it with VarpEntry_decref().
+ *
+ * @param z table
+ * @param vnet vnet id
+ * @param vmac virtual MAC address (copied)
+ * @return new entry or null
+ */
+VarpEntry * VarpTable_add(VarpTable *z, u32 vnet, Vmac *vmac){
+    VarpEntry *ventry;
+    HTEntry *entry;
+    unsigned long flags;
+
+    ventry = VarpEntry_new(vnet, vmac);
+    // Nothing to clean up if the allocation failed.
+    if(!ventry) return NULL;
+
+    VarpTable_write_lock(z, flags);
+    // The entry is used as its own hash key.
+    // NOTE(review): presumably works because VarpKey is the first member
+    // of VarpEntry - confirm against the struct definition.
+    entry = HashTable_add(z->table, ventry, ventry);
+    // Take the caller's reference while the table is still locked, so the
+    // entry cannot be dropped from the table before the caller owns it.
+    if(entry) VarpEntry_incref(ventry);
+    VarpTable_write_unlock(z, flags);
+
+    if(!entry){
+        VarpEntry_free(ventry);
+        ventry = NULL;
+    }
+    return ventry;
+}
+
+/** Remove an entry from the varp table.
+ * The entry itself is passed to the hash table as the key to remove.
+ * NOTE(review): unlike VarpTable_add() and VarpTable_lookup(), this does
+ * not take the table lock - confirm callers hold it.
+ *
+ * @param z table
+ * @param ventry entry to remove
+ * @return removed count
+ */
+int VarpTable_remove(VarpTable *z, VarpEntry *ventry){
+ return HashTable_remove(z->table, ventry);
+}
+
+/** Lookup an entry in the varp table.
+ * A found entry is returned with an extra reference, which the caller
+ * must drop with VarpEntry_decref().
+ *
+ * @param z table
+ * @param vnet vnet id
+ * @param vmac virtual MAC address
+ * @return entry found or null
+ */
+VarpEntry * VarpTable_lookup(VarpTable *z, u32 vnet, Vmac *vmac){
+    unsigned long flags;
+    VarpKey key = { .vnet = vnet, .vmac = *vmac };
+    VarpEntry *ventry;
+
+    VarpTable_read_lock(z, flags);
+    ventry = HashTable_get(z->table, &key);
+    // Take the reference before releasing the table lock; otherwise the
+    // entry could be removed and freed between the unlock and the incref.
+    if(ventry) VarpEntry_incref(ventry);
+    VarpTable_read_unlock(z, flags);
+    return ventry;
+}
+
+/** Handle output for a reachable ventry.
+ * Send the skb using the tunnel to the care-of address.
+ * The entry lock is released around the tunnel send and taken again
+ * before returning, so the caller is expected to hold it on entry.
+ *
+ * @param ventry varp entry
+ * @param skb skb to send
+ * @return 0 on success, error code otherwise
+ */
+int VarpEntry_send(VarpEntry *ventry, struct sk_buff *skb){
+ int err = 0;
+ // NOTE(review): the unlock below uses this local 'flags' (0) rather than
+ // the value saved by whoever took the lock - confirm this is intended.
+ unsigned long flags = 0;
+ u32 addr;
+
+ dprintf("> skb=%p\n", skb);
+ // Copy the address while still locked; it is used after the unlock.
+ addr = ventry->addr;
+ VarpEntry_unlock(ventry, flags);
+ err = vnet_tunnel_send(ventry->key.vnet, addr, skb);
+ VarpEntry_lock(ventry, flags);
+ dprintf("< err=%d\n", err);
+ return err;
+}
+
+/** Handle output for a non-reachable ventry. Send messages to complete it.
+ * If the entry is still incomplete, queue the skb, otherwise
+ * send it. If the queue is full, dequeue and free an old skb to
+ * make room for the new one.
+ * The entry lock is released around the solicit and taken again
+ * afterwards, so the caller is expected to hold it on entry.
+ *
+ * @param ventry varp entry
+ * @param skb skb to send
+ * @return 0 on success, error code otherwise
+ */
+int VarpEntry_resolve(VarpEntry *ventry, struct sk_buff *skb){
+ int err = 0;
+ // NOTE(review): local 'flags' (0) is what gets passed to unlock/lock
+ // below, not the caller's saved flags - confirm this is intended.
+ unsigned long flags = 0;
+
+ dprintf("> skb=%p\n", skb); //VarpEntry_print(ventry);
+ // Restart resolution: mark incomplete and reset the probe count to 1.
+ ventry->state = VARP_STATE_INCOMPLETE;
+ atomic_set(&ventry->probes, 1);
+ // Start probing only once: set the flag, take a reference and schedule
+ // the entry.
+ if(!VarpEntry_get_flags(ventry, VARP_FLAG_PROBING)){
+ VarpEntry_set_flags(ventry, VARP_FLAG_PROBING, 1);
+ VarpEntry_incref(ventry);
+ VarpEntry_schedule(ventry);
+ }
+ // Solicit without holding the entry lock.
+ VarpEntry_unlock(ventry, flags);
+ varp_solicit(skb, ventry->key.vnet);
+ VarpEntry_lock(ventry, flags);
+
+ // The state may have changed while the lock was dropped.
+ if(ventry->state == VARP_STATE_INCOMPLETE){
+ if(skb_queue_len(&ventry->queue) >= ventry->queue_max){
+ // Queue full: unlink and free the skb at the head of the queue.
+ struct sk_buff *oldskb;
+ oldskb = ventry->queue.next;
+ __skb_unlink(oldskb, &ventry->queue);
+ dprintf("> purging skb=%p\n", oldskb);
+ kfree_skb(oldskb);
+ }
+ __skb_queue_tail(&ventry->queue, skb);
+ } else {
+ err = VarpEntry_send(ventry, skb);
+ }
+ dprintf("< err=%d\n", err);
+ return err;
+}
+
+/** Output an skb through a ventry.
+ * A reachable entry sends straight away; any other state goes through
+ * resolution first.
+ *
+ * @param ventry varp entry
+ * @param skb skb to send
+ * @return 0 on success, error code otherwise
+ */
+int VarpEntry_output(VarpEntry *ventry, struct sk_buff *skb){
+    if(ventry->state == VARP_STATE_REACHABLE){
+        return VarpEntry_send(ventry, skb);
+    }
+    return VarpEntry_resolve(ventry, skb);
+}
+
+/** Drain the output queue of a ventry.
+ * While the entry is reachable, queued skbs are dequeued and output one
+ * at a time. Whatever is left in the queue afterwards is purged.
+ *
+ * @param ventry varp entry
+ */
+void VarpEntry_process_queue(VarpEntry *ventry){
+    struct sk_buff *pending;
+
+    while(ventry->state == VARP_STATE_REACHABLE){
+        pending = __skb_dequeue(&ventry->queue);
+        if(!pending) break;
+        VarpEntry_output(ventry, pending);
+    }
+    skb_queue_purge(&ventry->queue);
+}
+
+/** Update a ventry with a new care-of address and state.
+ * The timestamp is set to 'now' and the output queue is processed.
+ * Entries flagged permanent are left untouched.
+ *
+ * @param ventry varp entry
+ * @param addr care-of address
+ * @param state state
+ * @return 0 (no failure path at present)
+ */
+int VarpEntry_update(VarpEntry *ventry, u32 addr, int state){
+    int err = 0;
+    unsigned long now = jiffies;
+    unsigned long flags;
+
+    dprintf("> addr=" IPFMT " state=%d\n", NIPQUAD(addr), state);
+    VarpEntry_lock(ventry, flags);
+    if(!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT)){
+        ventry->addr = addr;
+        ventry->timestamp = now;
+        ventry->state = state;
+        VarpEntry_process_queue(ventry);
+    }
+    VarpEntry_unlock(ventry, flags);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Update the table entry for a vnet and vmac.
+ * Looks the entry up and, when it is missing and 'force' is non-zero,
+ * adds it. The entry is then updated with the address and state.
+ *
+ * @param z table
+ * @param vnet vnet id
+ * @param vmac virtual MAC address
+ * @param addr care-of address
+ * @param state state
+ * @param force add the entry if it does not exist
+ * @return 0 on success, -ENOENT if there is no entry to update
+ */
+int VarpTable_update(VarpTable *z, int vnet, Vmac *vmac, u32 addr,
+                     int state, int force){
+    int err;
+    VarpEntry *ventry;
+
+    dprintf("> vnet=%d mac=" MACFMT " addr=" IPFMT " state=%d force=%d\n",
+            vnet, MAC6TUPLE(vmac->mac), NIPQUAD(addr), state, force);
+    ventry = VarpTable_lookup(z, vnet, vmac);
+    if(!ventry && force){
+        dprintf("> No entry, adding\n");
+        ventry = VarpTable_add(z, vnet, vmac);
+    }
+    if(!ventry){
+        dprintf("> No entry found\n");
+        err = -ENOENT;
+    } else {
+        dprintf("> Updating\n");
+        err = VarpEntry_update(ventry, addr, state);
+        // Drop the reference taken by lookup/add.
+        VarpEntry_decref(ventry);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Update the ventry named by a varp header.
+ * The vnet id is converted from network order; the address is passed on
+ * as it appears in the header. A missing entry is never created.
+ *
+ * @param z table
+ * @param varph varp header
+ * @param state state
+ * @return 0 on success, -ENOENT if no entry found
+ */
+int VarpTable_update_entry(VarpTable *z, VarpHdr *varph, int state){
+    int vnet = ntohl(varph->vnet);
+
+    return VarpTable_update(z, vnet, &varph->vmac, varph->addr, state, 0);
+}
+
+/** Mark a vnet/vmac pair as reachable at an address in the global varp
+ * table, creating the entry if it does not exist.
+ *
+ * @param vnet vnet id
+ * @param vmac virtual MAC address (treated as a Vmac)
+ * @param addr care-of address
+ * @return 0 on success, -ENOSYS if there is no varp table, or error code
+ */
+int varp_update(int vnet, unsigned char *vmac, u32 addr){
+    if(varp_table){
+        return VarpTable_update(varp_table, vnet, (Vmac*)vmac, addr,
+                                VARP_STATE_REACHABLE, 1);
+    }
+    return -ENOSYS;
+}
+
+/** Put old varp entries into the incomplete state.
+ * Permanent entries are not changed.
+ * If 'all' is non-zero, all non-permanent entries
+ * are put into the incomplete state, regardless of age.
+ * An entry is old when its timestamp is more than VARP_ENTRY_TTL
+ * jiffies in the past.
+ *
+ * @param z table
+ * @param all reset all entries if non-zero
+ */
+void VarpTable_sweep(VarpTable *z, int all){
+    HashTable_for_decl(entry);
+    VarpEntry *ventry;
+    unsigned long now = jiffies;
+    unsigned long old = now - VARP_ENTRY_TTL;
+    unsigned long flags, vflags;
+
+    VarpTable_read_lock(z, flags);
+    // Sweep the table that was passed in, not the global varp_table.
+    HashTable_for_each(entry, z->table){
+        ventry = entry->value;
+        VarpEntry_lock(ventry, vflags);
+        // The signed difference keeps the age test correct when jiffies
+        // wraps around (same idea as the kernel's time_before()).
+        if(!VarpEntry_get_flags(ventry, VARP_FLAG_PERMANENT) &&
+           (all || ((long)(ventry->timestamp - old) < 0))){
+            VarpEntry_process_queue(ventry);
+            ventry->state = VARP_STATE_INCOMPLETE;
+        }
+        VarpEntry_unlock(ventry, vflags);
+    }
+    VarpTable_read_unlock(z, flags);
+}
+
+/** Handle a varp request. Look for a vif with the requested
+ * vnet and vmac. If one is found, reply with an announce carrying the
+ * vnet and vmac. Otherwise do nothing.
+ *
+ * @param skb incoming message
+ * @param varph varp message
+ * @return 0 if ok, -ENOENT if no matching vif, or error code from the send
+ */
+int varp_handle_request(struct sk_buff *skb, VarpHdr *varph){
+    int err = -ENOENT;
+    u32 vnet;
+    Vmac *vmac;
+    Vif *vif = NULL;
+
+    dprintf(">\n");
+    vnet = ntohl(varph->vnet);
+    vmac = &varph->vmac;
+    dprintf("> vnet=%d vmac=" MACFMT "\n", vnet, MAC6TUPLE(vmac->mac));
+    if(vif_lookup(vnet, vmac, &vif)) goto exit;
+    // Report the outcome of the reply instead of always returning -ENOENT.
+    err = varp_send(VARP_OP_ANNOUNCE, skb->dev, skb, vnet, vmac);
+    vif_decref(vif);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Send a gratuitous varp announce for the vnet and vmac of a vif.
+ *
+ * @param dev device to send on (may be null)
+ * @param vif vif
+ * @return 0 on success, -ENOSYS if there is no varp table, or error code
+ */
+int varp_announce_vif(struct net_device *dev, Vif *vif){
+    int err;
+
+    dprintf(">\n");
+    if(varp_table){
+        err = varp_send(VARP_OP_ANNOUNCE, dev, NULL, vif->vnet, &vif->vmac);
+    } else {
+        err = -ENOSYS;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handle a varp announce message.
+ * Marks the matching ventry in the global table reachable, if there is one.
+ *
+ * @param skb incoming message
+ * @param varph varp message
+ * @return 0 if OK, -ENOENT if no matching entry
+ */
+int varp_handle_announce(struct sk_buff *skb, VarpHdr *varph){
+    int err;
+
+    dprintf(">\n");
+    err = VarpTable_update_entry(varp_table, varph, VARP_STATE_REACHABLE);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handle an incoming varp message.
+ * Assumes the skb's network and transport headers are set and that the
+ * varp header directly follows the udp header.
+ *
+ * @param skb incoming message
+ * @return 1 once the message has passed the length check (whatever the
+ *         opcode handler returned), 0 for a multicast packet not sent to
+ *         the varp multicast address, -ENOSYS if there is no varp table,
+ *         -EINVAL if the message is too short
+ */
+int varp_handle_message(struct sk_buff *skb){
+    // Assume h. nh set, skb->data point after udp hdr (at varphdr).
+    int err = -EINVAL, mine = 0;
+    VarpHdr *varph = (void*)(skb->h.uh + 1);
+
+    dprintf(">\n");
+    if(!varp_table){
+        err = -ENOSYS;
+        goto exit;
+    }
+    if(MULTICAST(skb->nh.iph->daddr) &&
+       (skb->nh.iph->daddr != varp_mcast_addr)){
+        // Ignore multicast packets not addressed to us.
+        err = 0;
+        dprintf("> daddr=" IPFMT " mcaddr=" IPFMT "\n",
+                NIPQUAD(skb->nh.iph->daddr), NIPQUAD(varp_mcast_addr));
+        goto exit;
+    }
+    if(skb->len < sizeof(*varph)){
+        // Both values are cast so that they match the %d conversions.
+        wprintf("> Varp msg too short: %d < %d\n",
+                (int)skb->len, (int)sizeof(*varph));
+        goto exit;
+    }
+    // From here on the message is treated as ours (see exit).
+    mine = 1;
+    if(varph->id != htons(VARP_ID)){
+        // It's not varp at all - ignore it.
+        wprintf("> Unknown id: %d \n", ntohs(varph->id));
+        goto exit;
+    }
+    if(1){
+        dprintf("> saddr=" IPFMT " daddr=" IPFMT "\n",
+                NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr));
+        dprintf("> sport=%u dport=%u\n", ntohs(skb->h.uh->source), ntohs(skb->h.uh->dest));
+        dprintf("> opcode=%d vnet=%u vmac=" MACFMT " addr=" IPFMT "\n",
+                ntohs(varph->opcode),
+                ntohl(varph->vnet),
+                MAC6TUPLE(varph->vmac.mac),
+                NIPQUAD(varph->addr));
+        varp_dprint();
+    }
+    switch(ntohs(varph->opcode)){
+    case VARP_OP_REQUEST:
+        err = varp_handle_request(skb, varph);
+        break;
+    case VARP_OP_ANNOUNCE:
+        err = varp_handle_announce(skb, varph);
+        break;
+    default:
+        wprintf("> Unknown opcode: %d \n", ntohs(varph->opcode));
+        break;
+    }
+  exit:
+    // A message that got past the length check always reports 1.
+    if(mine) err = 1;
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Send an outgoing packet on the appropriate vnet tunnel.
+ * Packets with a multicast destination MAC go to the varp multicast
+ * address. Everything else goes through the varp entry for the
+ * destination MAC, which is created if it does not exist.
+ *
+ * @param skb outgoing message
+ * @param vnet vnet (host order)
+ * @return 0 on success, error code otherwise
+ */
+int varp_output(struct sk_buff *skb, u32 vnet){
+    int err = 0;
+    unsigned char *dest = NULL;
+    VarpEntry *ventry = NULL;
+    unsigned long flags;
+
+    dprintf("> skb=%p vnet=%u\n", skb, vnet);
+    if(!varp_table){
+        err = -ENOSYS;
+        goto exit;
+    }
+    dprintf("> skb.mac=%p\n", skb->mac.raw);
+    if(!skb->mac.raw){
+        wprintf("> No ethhdr in skb!\n");
+        err = -EINVAL;
+        goto exit;
+    }
+    dest = MAC_ETH(skb)->h_dest;
+    if(mac_is_multicast(dest)){
+        err = vnet_tunnel_send(vnet, varp_mcast_addr, skb);
+        goto exit;
+    }
+    ventry = VarpTable_lookup(varp_table, vnet, (Vmac*)dest);
+    if(!ventry){
+        ventry = VarpTable_add(varp_table, vnet, (Vmac*)dest);
+    }
+    if(!ventry){
+        err = -ENOMEM;
+        goto exit;
+    }
+    VarpEntry_lock(ventry, flags);
+    err = VarpEntry_output(ventry, skb);
+    VarpEntry_unlock(ventry, flags);
+    // Drop the reference taken by lookup/add.
+    VarpEntry_decref(ventry);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Change the varp multicast address after initialization.
+ * Closes varp and reopens it with the new multicast address and the
+ * current unicast address and port.
+ *
+ * @param addr address (network order)
+ * @return 0 on success, error code otherwise
+ */
+int varp_set_mcast_addr(uint32_t addr){
+    varp_close();
+    varp_mcast_addr = addr;
+    return varp_open(varp_mcast_addr, varp_ucast_addr, varp_port);
+}
+
+/** Initialize the varp multicast address from a module parameter.
+ * Falls back to VARP_MCAST_ADDR when the string is null or cannot be
+ * parsed. Returns nothing; the result is stored in varp_mcast_addr.
+ *
+ * @param s address in IPv4 notation (may be null)
+ */
+static void varp_init_mcast_addr(char *s){
+    unsigned long parsed = 0;
+
+    dprintf("> %s\n", s);
+    if(!s || (get_inet_addr(s, &parsed) < 0)){
+        varp_mcast_addr = htonl(VARP_MCAST_ADDR);
+        return;
+    }
+    varp_mcast_addr = (u32)parsed;
+}
+
+/** Initialize the varp cache.
+ * Creates the global varp table, works out the multicast address, the
+ * unicast address of the varp device and the port, then opens varp.
+ * If any step fails the table is freed again and varp_table is left null.
+ *
+ * @return 0 on success, error code otherwise
+ */
+int varp_init(void){
+    int err = 0;
+    struct net_device *dev = NULL;
+
+    dprintf(">\n");
+    varp_table = VarpTable_new();
+    if(!varp_table){
+        err = -ENOMEM;
+        goto exit;
+    }
+    varp_init_mcast_addr(varp_mcaddr);
+    err = vnet_get_device(varp_device, &dev);
+    dprintf("> vnet_get_device(%s)=%d\n", varp_device, err);
+    if(err) goto exit;
+    err = vnet_get_device_address(dev, &varp_ucast_addr);
+    dprintf("> vnet_get_device_address()=%d\n", err);
+    if(err) goto exit;
+    varp_port = htons(VARP_PORT);
+
+    err = varp_open(varp_mcast_addr, varp_ucast_addr, varp_port);
+    dprintf("> varp_open()=%d\n", err);
+  exit:
+    if(dev) dev_put(dev);
+    if(err && varp_table){
+        // Do not leave the table and its scheduled sweep timer behind
+        // when initialization fails.
+        VarpTable *z = varp_table;
+        varp_table = NULL;
+        VarpTable_free(z);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Shut varp down.
+ * Closes varp, then detaches the global table pointer before freeing
+ * the table.
+ */
+void varp_exit(void){
+    VarpTable *z;
+
+    dprintf(">\n");
+    varp_close();
+    z = varp_table;
+    if(z){
+        varp_table = NULL;
+        VarpTable_free(z);
+    }
+    dprintf("<\n");
+}
+
+// Module parameters, both strings. varp_init() passes varp_mcaddr to
+// varp_init_mcast_addr() and varp_device to vnet_get_device().
+MODULE_PARM(varp_mcaddr, "s");
+MODULE_PARM_DESC(varp_mcaddr, "VARP multicast address");
+
+MODULE_PARM(varp_device, "s");
+MODULE_PARM_DESC(varp_device, "VARP network device");
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef _VNET_VARP_H
+#define _VNET_VARP_H
+
+// NOTE(review): presumably enables gratuitous varp announcements; it is
+// not referenced in this header - confirm where it is tested.
+#define CONFIG_VARP_GRATUITOUS 1
+
+// Forward declarations: only pointers to these types are used below.
+struct net_device;
+struct sk_buff;
+struct Vif;
+
+/** Name of a network device (a string literal). */
+#define DEVICE "xen-br0"
+
+/* Device helpers. */
+extern int vnet_get_device(const char *name, struct net_device **dev);
+extern int vnet_get_device_address(struct net_device *dev, u32 *addr);
+
+/* Message handling, output and cache update. */
+extern int varp_handle_message(struct sk_buff *skb);
+extern int varp_output(struct sk_buff *skb, u32 vnet);
+extern int varp_update(int vnet, unsigned char *vmac, u32 addr);
+
+/* Setup and teardown of the varp cache. */
+extern int varp_init(void);
+extern void varp_exit(void);
+
+/* Opening and closing varp, and changing the multicast address. */
+extern int varp_open(u32 mcaddr, u32 addr, u16 port);
+extern void varp_close(void);
+extern int varp_set_mcast_addr(u32 addr);
+
+extern void varp_print(void);
+
+extern int varp_announce_vif(struct net_device *dev, struct Vif *vif);
+//extern int varp_announce_vifs(struct net_device *dev, struct task_struct *domain);
+
+/** The varp multicast address. */
+extern u32 varp_mcast_addr;
+
+
+/* MAC broadcast addr is ff-ff-ff-ff-ff-ff (all 1's).
+ * MAC multicast addr has low bit 1, i.e. 01-00-00-00-00-00.
+ */
+
+/** Test if a MAC address is a multicast or broadcast address,
+ * i.e. whether the low bit of its first byte is set.
+ *
+ * @param mac address
+ * @return 1 if it is, 0 if not
+ */
+static inline int mac_is_multicast(u8 mac[ETH_ALEN]){
+    return (mac[0] & 1) != 0;
+}
+
+/** Test if a MAC address is the broadcast address (every byte 0xff).
+ *
+ * @param mac address
+ * @return 1 if it is, 0 if not
+ */
+static inline int mac_is_broadcast(u8 mac[ETH_ALEN]){
+    int i;
+
+    for(i = 0; i < ETH_ALEN; i++){
+        if(mac[i] != 0xff) return 0;
+    }
+    return 1;
+}
+
+/** Test if a MAC address is the all-zero address (every byte 0).
+ *
+ * @param mac address
+ * @return 1 if it is, 0 if not
+ */
+static inline int mac_is_zero(u8 mac[ETH_ALEN]){
+    int i;
+
+    for(i = 0; i < ETH_ALEN; i++){
+        if(mac[i] != 0) return 0;
+    }
+    return 1;
+}
+
+/** Print format for a mac address: six colon-separated two-digit hex
+ * bytes. Takes six arguments, e.g. as produced by MAC6TUPLE().
+ */
+#define MACFMT "%02x:%02x:%02x:%02x:%02x:%02x"
+
+/** Expand a mac address (anything indexable with [0]..[5]) into six
+ * comma-separated byte expressions. The argument is evaluated six times.
+ */
+#define MAC6TUPLE(_mac) (_mac)[0], (_mac)[1], (_mac)[2], (_mac)[3], (_mac)[4], (_mac)[5]
+
+/** Get the network part of an address: the bits selected by the netmask.
+ *
+ * @param netmask subnet netmask
+ * @param addr subnet address
+ * @return subnet
+ */
+static inline u32 subnet_net(u32 netmask, u32 addr){
+    return addr & netmask;
+}
+
+/** Get the host part of an address: the bits not covered by the netmask.
+ *
+ * @param netmask subnet netmask
+ * @param addr address
+ * @return address within the subnet
+ */
+static inline u32 subnet_addr(u32 netmask, u32 addr){
+    return addr & ~netmask;
+}
+
+/** Get the broadcast address for a subnet: the network part with every
+ * host bit set.
+ *
+ * @param netmask subnet netmask
+ * @param netaddr subnet address
+ * @return subnet broadcast address
+ */
+static inline u32 subnet_broadcast_addr(u32 netmask, u32 netaddr){
+    u32 net = subnet_net(netmask, netaddr);
+    u32 hostbits = ~netmask;
+
+    return net | hostbits;
+}
+
+/** Test if an address corresponds to a subnet broadcast.
+ * True if the address within the subnet is all 1's (in binary),
+ * even if the address is not in the subnet.
+ *
+ * The host part is compared against that of INADDR_BROADCAST (all 1's).
+ * Comparing against INADDR_ANY would test for an all-zero host part.
+ *
+ * @param netmask subnet mask
+ * @param addr address
+ * @return 1 if it does, 0 otherwise
+ */
+static inline int subnet_broadcast(u32 netmask, u32 addr){
+    return subnet_addr(netmask, INADDR_BROADCAST) == subnet_addr(netmask, addr);
+}
+
+/** Test if an address is in a subnet, i.e. whether it agrees with the
+ * subnet address on every bit covered by the netmask.
+ *
+ * @param netmask subnet mask
+ * @param netaddr subnet address
+ * @param addr address
+ * @return 1 if it is, 0 otherwise
+ */
+static inline int subnet_local(u32 netmask, u32 netaddr, u32 addr){
+    return ((netaddr ^ addr) & netmask) == 0;
+}
+
+#endif /* ! _VNET_VARP_H */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/version.h>
+
+#include <asm/uaccess.h>
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/version.h>
+#include <linux/smp_lock.h>
+#include <net/sock.h>
+
+#include <if_varp.h>
+#include <varp.h>
+
+/* Get macros needed to define system calls as functions in the kernel. */
+#define __KERNEL_SYSCALLS__
+// File-local errno; it is printed next to the return value in this
+// file's error messages.
+// NOTE(review): presumably set by the _syscall wrapper macros - confirm.
+static int errno;
+#include <linux/unistd.h>
+
+#define MODULE_NAME "VARP"
+// DEBUG is defined and immediately undefined again, so it is not defined
+// when debug.h is included.
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+// Compensate for struct sock fields having 'sk_' added
+// to them in 2.6.
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+
+#define SK_RECEIVE_QUEUE sk_receive_queue
+#define SK_SLEEP sk_sleep
+
+#else
+
+#define SK_RECEIVE_QUEUE receive_queue
+#define SK_SLEEP sleep
+
+#endif
+
+/** @file
+ * Support for the VARP udp sockets.
+ */
+
+/** Switch the current address-limit segment.
+ *
+ * @param fs segment to install
+ * @return the segment that was in effect before the call
+ */
+static inline mm_segment_t change_fs(mm_segment_t fs){
+    mm_segment_t previous;
+
+    previous = get_fs();
+    set_fs(fs);
+    return previous;
+}
+
+/* Replicate the user-space socket API.
+ * The parts we need anyway.
+ */
+
+/* Define the socketcall() syscall as a function, using the _syscall2
+ * macro. Multiplexes all the socket-related calls: every wrapper below
+ * packs its arguments into an array and passes it here together with
+ * the call id.
+ *
+ * @param call socket call id (SYS_SOCKET, SYS_BIND, ...)
+ * @param args arguments (up to 6)
+ * @return call-dependent value
+ */
+static inline _syscall2(int, socketcall,
+ int, call,
+ unsigned long *, args)
+
+/** Kernel-side replica of socket(), routed through socketcall(SYS_SOCKET).
+ *
+ * @param family address family
+ * @param type socket type
+ * @param protocol protocol
+ * @return value returned by socketcall()
+ */
+int socket(int family, int type, int protocol){
+    unsigned long args[6] = {
+        (unsigned long)family,
+        (unsigned long)type,
+        (unsigned long)protocol,
+    };
+
+    return socketcall(SYS_SOCKET, args);
+}
+
+/** Kernel-side replica of bind(), routed through socketcall(SYS_BIND).
+ *
+ * @param fd socket file descriptor
+ * @param umyaddr address to bind to
+ * @param addrlen size of the address
+ * @return value returned by socketcall()
+ */
+int bind(int fd, struct sockaddr *umyaddr, int addrlen){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)umyaddr,
+        (unsigned long)addrlen,
+    };
+
+    return socketcall(SYS_BIND, args);
+}
+
+/** Kernel-side replica of connect(), routed through
+ * socketcall(SYS_CONNECT).
+ *
+ * @param fd socket file descriptor
+ * @param uservaddr address to connect to
+ * @param addrlen size of the address
+ * @return value returned by socketcall()
+ */
+int connect(int fd, struct sockaddr *uservaddr, int addrlen){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)uservaddr,
+        (unsigned long)addrlen,
+    };
+
+    return socketcall(SYS_CONNECT, args);
+}
+
+/** Kernel-side replica of sendto(), routed through socketcall(SYS_SENDTO).
+ *
+ * @param fd socket file descriptor
+ * @param buff data to send
+ * @param len number of bytes in buff
+ * @param flags send flags
+ * @param addr destination address
+ * @param addr_len size of the destination address
+ * @return value returned by socketcall()
+ */
+int sendto(int fd, void * buff, size_t len,
+           unsigned flags, struct sockaddr *addr,
+           int addr_len){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)buff,
+        (unsigned long)len,
+        (unsigned long)flags,
+        (unsigned long)addr,
+        (unsigned long)addr_len,
+    };
+
+    return socketcall(SYS_SENDTO, args);
+}
+
+/** Kernel-side replica of recvfrom(), routed through
+ * socketcall(SYS_RECVFROM).
+ *
+ * @param fd socket file descriptor
+ * @param ubuf buffer to receive into
+ * @param size size of ubuf
+ * @param flags receive flags
+ * @param addr where to store the source address
+ * @param addr_len in/out size of the source address
+ * @return value returned by socketcall()
+ */
+int recvfrom(int fd, void * ubuf, size_t size,
+             unsigned flags, struct sockaddr *addr,
+             int *addr_len){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)ubuf,
+        (unsigned long)size,
+        (unsigned long)flags,
+        (unsigned long)addr,
+        (unsigned long)addr_len,
+    };
+
+    return socketcall(SYS_RECVFROM, args);
+}
+
+/** Kernel-side replica of setsockopt(), routed through
+ * socketcall(SYS_SETSOCKOPT).
+ *
+ * @param fd socket file descriptor
+ * @param level option level
+ * @param optname option name
+ * @param optval option value
+ * @param optlen size of the option value
+ * @return value returned by socketcall()
+ */
+int setsockopt(int fd, int level, int optname, void *optval, int optlen){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)level,
+        (unsigned long)optname,
+        (unsigned long)optval,
+        (unsigned long)optlen,
+    };
+
+    return socketcall(SYS_SETSOCKOPT, args);
+}
+
+/** Kernel-side replica of getsockopt(), routed through
+ * socketcall(SYS_GETSOCKOPT).
+ *
+ * @param fd socket file descriptor
+ * @param level option level
+ * @param optname option name
+ * @param optval where to store the option value
+ * @param optlen in/out size of the option value
+ * @return value returned by socketcall()
+ */
+int getsockopt(int fd, int level, int optname, void *optval, int *optlen){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)level,
+        (unsigned long)optname,
+        (unsigned long)optval,
+        (unsigned long)optlen,
+    };
+
+    return socketcall(SYS_GETSOCKOPT, args);
+}
+
+/** Kernel-side replica of shutdown(), routed through
+ * socketcall(SYS_SHUTDOWN).
+ *
+ * @param fd socket file descriptor
+ * @param how which directions to shut down
+ * @return value returned by socketcall()
+ */
+int shutdown(int fd, int how){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)how,
+    };
+
+    return socketcall(SYS_SHUTDOWN, args);
+}
+
+/** Kernel-side replica of getsockname(), routed through
+ * socketcall(SYS_GETSOCKNAME).
+ *
+ * @param fd socket file descriptor
+ * @param usockaddr where to store the socket's address
+ * @param usockaddr_len in/out size of the address
+ * @return value returned by socketcall()
+ */
+int getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_len){
+    unsigned long args[6] = {
+        (unsigned long)fd,
+        (unsigned long)usockaddr,
+        (unsigned long)usockaddr_len,
+    };
+
+    return socketcall(SYS_GETSOCKNAME, args);
+}
+
+/*============================================================================*/
+/** Socket flags. Bit values that can be or'ed together and passed to
+ * create_socket().
+ */
+enum {
+ VSOCK_REUSE = 1,      // set SO_REUSEADDR
+ VSOCK_BIND = 2,       // bind() the socket to the address
+ VSOCK_CONNECT = 4,    // connect() the socket to the address
+ VSOCK_BROADCAST = 8,  // set SO_BROADCAST
+ VSOCK_MULTICAST = 16, // join the multicast group given by the address
+ };
+
+/** Convert socket flags to a 5-character string, one position per flag
+ * in the order connect, bind, reuse, broadcast, multicast. A set flag
+ * shows its letter ('c', 'b', 'r', 'B', 'M'), a clear one shows '-'.
+ * The result lives in a static buffer that the next call overwrites.
+ *
+ * @param flags flags
+ * @return static string
+ */
+char * socket_flags(int flags){
+    static char s[6];
+
+    s[0] = (flags & VSOCK_CONNECT)   ? 'c' : '-';
+    s[1] = (flags & VSOCK_BIND)      ? 'b' : '-';
+    s[2] = (flags & VSOCK_REUSE)     ? 'r' : '-';
+    s[3] = (flags & VSOCK_BROADCAST) ? 'B' : '-';
+    s[4] = (flags & VSOCK_MULTICAST) ? 'M' : '-';
+    s[5] = '\0';
+    return s;
+}
+
+/** The varp multicast socket (a file descriptor; -1 initially). */
+int varp_mcast_sock = -1;
+
+/** The varp unicast socket (a file descriptor; -1 initially). */
+int varp_ucast_sock = -1;
+
+/** Control flag for whether varp should be running.
+ * If this is set to 0 then the varp thread will notice and
+ * (eventually) exit. The thread indicates that it has exited by
+ * setting varp_running to 0.
+ */
+atomic_t varp_run = ATOMIC_INIT(0);
+
+/** State flag indicating whether the varp thread is running. */
+atomic_t varp_running = ATOMIC_INIT(0);
+
+/** Set or clear SO_REUSEADDR on a socket. A failure is logged.
+ *
+ * @param sock socket
+ * @param reuse flag
+ * @return 0 on success, error code otherwise
+ */
+int setsock_reuse(int sock, int reuse){
+    int err = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
+
+    if(err >= 0) return err;
+    eprintf("> setsockopt SO_REUSEADDR: %d %d\n", err, errno);
+    return err;
+}
+
+/** Set or clear SO_BROADCAST on a socket. A failure is logged.
+ *
+ * @param sock socket
+ * @param bcast flag
+ * @return 0 on success, error code otherwise
+ */
+int setsock_broadcast(int sock, int bcast){
+    int err = setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &bcast, sizeof(bcast));
+
+    if(err >= 0) return err;
+    eprintf("> setsockopt SO_BROADCAST: %d %d\n", err, errno);
+    return err;
+}
+
+/** Join a socket to a multicast group.
+ * Multicast loopback is turned off, then the group is joined on any
+ * interface. The DEVICE address is looked up only for the debug output.
+ *
+ * Failures are logged but, for now, always reported as success (see the
+ * 'todo' at the exit label).
+ *
+ * @param sock socket
+ * @param saddr multicast address
+ * @return 0 (errors are currently swallowed)
+ */
+int setsock_multicast(int sock, uint32_t saddr){
+    int err = 0;
+    struct net_device *dev = NULL;
+    u32 addr = 0;
+    struct ip_mreqn mreq = {};
+    int mloop = 0;
+
+    err = vnet_get_device(DEVICE, &dev);
+    if(err){
+        eprintf("> error getting device: %d %d\n", err, errno);
+        goto exit;
+    }
+    err = vnet_get_device_address(dev, &addr);
+    if(err){
+        eprintf("> error getting device address: %d %d\n", err, errno);
+        goto exit;
+    }
+    // See 'man 7 ip' for these options.
+    mreq.imr_multiaddr.s_addr = saddr; // IP multicast address.
+    //mreq.imr_address.s_addr = addr; // Interface IP address.
+    mreq.imr_address.s_addr = INADDR_ANY; // Interface IP address.
+    mreq.imr_ifindex = 0; // Interface index (0 means any).
+    dprintf("> saddr=%u.%u.%u.%u addr=%u.%u.%u.%u ifindex=%d\n",
+            NIPQUAD(saddr), NIPQUAD(addr), mreq.imr_ifindex);
+    err = setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &mloop, sizeof(mloop));
+    if(err < 0){
+        eprintf("> setsockopt IP_MULTICAST_LOOP: %d %d\n", err, errno);
+        goto exit;
+    }
+    err = setsockopt(sock, SOL_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
+    if(err < 0){
+        eprintf("> setsockopt IP_ADD_MEMBERSHIP: %d %d\n", err, errno);
+        goto exit;
+    }
+  exit:
+    // Release the device obtained from vnet_get_device(), as varp_init()
+    // does for the same call.
+    if(dev) dev_put(dev);
+    err = 0; //todo: remove hack
+    return err;
+}
+
+/** Set a socket's multicast ttl (default is 1) via IP_MULTICAST_TTL.
+ *
+ * @param sock socket
+ * @param ttl ttl
+ * @return 0 on success, error code otherwise
+ */
+int setsock_multicast_ttl(int sock, uint8_t ttl){
+    return setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
+}
+
+/** Create a socket.
+ * The flags can include VSOCK_REUSE, VSOCK_BROADCAST, VSOCK_MULTICAST,
+ * VSOCK_CONNECT and VSOCK_BIND; they are applied in that order.
+ *
+ * NOTE(review): when a later step fails the descriptor is not released
+ * here and *val is set to -1, so the caller cannot release it either.
+ *
+ * @param socktype socket type
+ * @param saddr address
+ * @param port port
+ * @param flags flags
+ * @param val return value for the socket connection (-1 on error)
+ * @return 0 on success, error code otherwise
+ */
+int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, int *val){
+    int err = 0;
+    int sock;
+    // Zeroed so that the padding in sockaddr_in is not left uninitialized.
+    struct sockaddr_in addr_in = {};
+    struct sockaddr *addr = (struct sockaddr *)&addr_in;
+    int addr_n = sizeof(addr_in);
+    int reuse, bcast;
+    int sockproto = 0;
+
+    reuse = (flags & VSOCK_REUSE);
+    bcast = (flags & VSOCK_BROADCAST);
+    addr_in.sin_family = AF_INET;
+    addr_in.sin_addr.s_addr = saddr;
+    addr_in.sin_port = port;
+    dprintf("> flags=%s addr=%u.%u.%u.%u port=%d\n",
+            socket_flags(flags),
+            NIPQUAD(saddr), ntohs(port));
+
+    switch(socktype){
+    case SOCK_DGRAM: sockproto = IPPROTO_UDP; break;
+    case SOCK_STREAM: sockproto = IPPROTO_TCP; break;
+    }
+    sock = socket(AF_INET, socktype, sockproto);
+    if(sock < 0){
+        // Report the failure; previously err stayed 0 and the caller was
+        // handed a negative descriptor together with a success code.
+        err = sock;
+        goto exit;
+    }
+    if(reuse){
+        err = setsock_reuse(sock, reuse);
+        if(err < 0) goto exit;
+    }
+    if(bcast){
+        err = setsock_broadcast(sock, bcast);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_MULTICAST){
+        err = setsock_multicast(sock, saddr);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_CONNECT){
+        err = connect(sock, addr, addr_n);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_BIND){
+        err = bind(sock, addr, addr_n);
+        if(err < 0) goto exit;
+    }
+  exit:
+    *val = (err ? -1 : sock);
+    if(err) eprintf("> err=%d errno=%d\n", err, errno);
+    return err;
+}
+
+/** Open the varp multicast socket.
+ * Creates a UDP socket with the reuse, multicast and broadcast flags for
+ * the multicast address and, if that address really is multicast, sets
+ * the multicast ttl to 1. The two 'if(0)' sections are disabled code.
+ *
+ * @param mcaddr multicast address
+ * @param saddr address
+ * @param port port
+ * @param val return parameter for the socket (-1 on error)
+ * @return 0 on success, error code otherwise
+ */
+int varp_mcast_open(uint32_t mcaddr, uint32_t saddr, uint16_t port, int *val){
+    int err = 0;
+    int flags = VSOCK_REUSE | VSOCK_MULTICAST | VSOCK_BROADCAST;
+    int multicast = MULTICAST(mcaddr);
+    int sock = 0;
+    struct sockaddr_in addr_in;
+    struct sockaddr *addr = (struct sockaddr *)&addr_in;
+    int addr_n = sizeof(addr_in);
+
+    dprintf(">\n");
+    err = create_socket(SOCK_DGRAM, mcaddr, port, flags, &sock);
+    if(err < 0) goto exit;
+    if(multicast){
+        err = setsock_multicast_ttl(sock, 1);
+        if(err < 0) goto exit;
+    }
+    if(0){
+        /* Disabled: bind the socket to the local address. */
+        addr_in.sin_family = AF_INET;
+        addr_in.sin_addr.s_addr = saddr;
+        addr_in.sin_port = port;
+        err = bind(sock, addr, addr_n);
+        if(err < 0){
+            eprintf("> bind: %d %d\n", err, errno);
+            goto exit;
+        }
+    }
+    if(0){
+        /* Disabled: debug print of the socket name. */
+        struct sockaddr_in self = {};
+        int self_n;
+        getsockname(sock, (struct sockaddr *)&self, &self_n);
+        dprintf("> sockname sock=%d addr=%u.%u.%u.%u port=%d\n",
+                sock, NIPQUAD(saddr), ntohs(port));
+    }
+  exit:
+    if(err){
+        shutdown(sock, 2);
+        *val = -1;
+    } else {
+        *val = sock;
+    }
+    dprintf("< err=%d val=%d\n", err, *val);
+    return err;
+}
+
+/** Open the varp unicast socket: a UDP socket bound to the address and
+ * port, with address reuse enabled.
+ *
+ * @param addr address
+ * @param port port
+ * @param val return parameter for the socket
+ * @return 0 on success, error code otherwise
+ */
+int varp_ucast_open(uint32_t addr, u16 port, int *val){
+    int err;
+
+    dprintf(">\n");
+    err = create_socket(SOCK_DGRAM, addr, port, VSOCK_BIND | VSOCK_REUSE, val);
+    dprintf("< err=%d val=%d\n", err, *val);
+    return err;
+}
+
+/* Here because inline in 'socket.c'.
+ * Used in this file after every sockfd_lookup(); drops the reference on
+ * the socket's file with fput(). Only defined here if it is not already
+ * a macro.
+ */
+#ifndef sockfd_put
+#define sockfd_put(sock) fput((sock)->file)
+#endif
+
+/** Take the next skb off a socket's receive queue.
+ * The socket is looked up from the descriptor and released again before
+ * returning.
+ *
+ * @param fd socket file descriptor
+ * @return skb, or NULL if the queue is empty or fd has no socket
+ */
+static struct sk_buff *get_sock_skb(int fd){
+    int err = 0;
+    struct sk_buff *skb;
+    struct socket *sock = sockfd_lookup(fd, &err);
+
+    if(!sock){
+        dprintf("> no sock for fd=%d\n", fd);
+        return NULL;
+    }
+    skb = skb_dequeue(&sock->sk->SK_RECEIVE_QUEUE);
+    sockfd_put(sock);
+    return skb;
+}
+
+/** Process the next skb queued on a socket, if there is one.
+ * The skb is passed to varp_handle_message() and then freed.
+ *
+ * @param fd socket file descriptor
+ * @return 1 if there was an skb, 0 otherwise
+ */
+static int handle_sock_skb(int fd){
+    struct sk_buff *pkt = get_sock_skb(fd);
+
+    if(!pkt) return 0;
+    dprintf("> skb fd=%d skb=%p\n", fd, pkt);
+    varp_handle_message(pkt);
+    kfree_skb(pkt);
+    return 1;
+}
+
+/** Register a wait queue entry on a socket's sleep queue.
+ *
+ * @param fd socket file descriptor
+ * @param waitq queue
+ * @return 0 on success, the error reported by the socket lookup otherwise
+ */
+int sock_add_wait_queue(int fd, wait_queue_t *waitq){
+    int err = 0;
+    struct socket *so;
+
+    dprintf("> fd=%d\n", fd);
+    so = sockfd_lookup(fd, &err);
+    if(so){
+        add_wait_queue(so->sk->SK_SLEEP, waitq);
+        sockfd_put(so);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Unregister a wait queue entry from a socket's sleep queue.
+ *
+ * @param fd socket file descriptor
+ * @param waitq queue
+ * @return 0 on success, the error reported by the socket lookup otherwise
+ */
+int sock_remove_wait_queue(int fd, wait_queue_t *waitq){
+    int err = 0;
+    struct socket *so = sockfd_lookup(fd, &err);
+
+    if(so){
+        remove_wait_queue(so->sk->SK_SLEEP, waitq);
+        sockfd_put(so);
+    }
+    return err;
+}
+
+/** Loop handling the varp sockets.
+ * We use kernel API for this (waitqueue, schedule_timeout) instead
+ * of select because the select syscall was returning EFAULT. Oh well.
+ *
+ * The thread registers itself on both socket wait queues, then repeatedly
+ * drains one skb from each socket; when neither had anything it sleeps
+ * (interruptibly) for up to 3 seconds. The loop ends when varp_run is
+ * no longer 1. varp_running is set while the thread is active.
+ *
+ * @param arg arguments (unused)
+ * @return 0 on success, or the error from registering on a socket wait queue
+ */
+int varp_main(void *arg){
+    int err = 0;
+    long timeout = 3 * HZ;
+    int count = 0;
+    DECLARE_WAITQUEUE(mcast_wait, current);
+    DECLARE_WAITQUEUE(ucast_wait, current);
+
+    dprintf("> start\n");
+    atomic_set(&varp_running, 1);
+    // Check each registration separately so a failure on the first socket
+    // is not hidden by the result of the second.
+    err = sock_add_wait_queue(varp_mcast_sock, &mcast_wait);
+    if(err) goto exit;
+    err = sock_add_wait_queue(varp_ucast_sock, &ucast_wait);
+    if(err) goto exit_mcast;
+    while(atomic_read(&varp_run) == 1){
+        count = 0;
+        count += handle_sock_skb(varp_mcast_sock);
+        count += handle_sock_skb(varp_ucast_sock);
+        if(!count){
+            // No skbs were handled, so go back to sleep.
+            set_current_state(TASK_INTERRUPTIBLE);
+            schedule_timeout(timeout);
+            current->state = TASK_RUNNING;
+        }
+    }
+    sock_remove_wait_queue(varp_ucast_sock, &ucast_wait);
+  exit_mcast:
+    sock_remove_wait_queue(varp_mcast_sock, &mcast_wait);
+  exit:
+    // Always clear the flag: varp_close() polls it to know we have stopped.
+    atomic_set(&varp_running, 0);
+    //MOD_DEC_USE_COUNT;
+    dprintf("< stop err=%d\n", err);
+    return err;
+}
+
+/** Start the varp thread.
+ * Sets varp_run, clears varp_running and spawns varp_main() as a kernel
+ * thread sharing filesystem info, file descriptors and signal handlers.
+ *
+ * @return 0 on success, negative error code if the thread could not be created
+ */
+int varp_start(void){
+    int err = 0;
+    void *args = NULL;
+    //CLONE_VM is deliberately not set.
+    int flags = CLONE_FS | CLONE_FILES | CLONE_SIGHAND;
+    long pid = 0;
+
+    dprintf(">\n");
+    atomic_set(&varp_run, 1);
+    atomic_set(&varp_running, 0);
+    pid = kernel_thread(varp_main, args, flags);
+    if(pid < 0){
+        // kernel_thread() reports failure as a negative value: propagate it
+        // and withdraw the run request since no thread exists.
+        atomic_set(&varp_run, 0);
+        err = pid;
+    }
+    dprintf("< pid=%ld\n", pid);
+    return err;
+}
+
+/** Stop the varp thread and shut down the varp sockets.
+ * Clears varp_run, then waits (up to 10 sleeps of one second each) for the
+ * thread to clear varp_running before shutting the sockets down.
+ */
+void varp_close(void){
+    mm_segment_t saved_fs;
+    long delay = 1 * HZ;
+    int attempts;
+
+    dprintf(">\n");
+    // Ask the varp thread to stop and give it a while to do so.
+    atomic_set(&varp_run, 0);
+    for(attempts = 10; atomic_read(&varp_running) && attempts-- > 0; ){
+        set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(delay);
+        current->state = TASK_RUNNING;
+    }
+    // Shut the sockets down with the kernel data segment selected.
+    saved_fs = change_fs(KERNEL_DS);
+    if(varp_mcast_sock > 0){
+        shutdown(varp_mcast_sock, 2);
+        varp_mcast_sock = -1;
+    }
+    if(varp_ucast_sock > 0){
+        shutdown(varp_ucast_sock, 2);
+        varp_ucast_sock = -1;
+    }
+    set_fs(saved_fs);
+    //MOD_DEC_USE_COUNT;
+    dprintf("<\n");
+}
+
+/** Open the varp sockets and start the thread handling them.
+ * The sockets are opened with the kernel data segment selected; the
+ * previous segment is restored exactly once before the thread is started.
+ * On any error everything is torn down again with varp_close().
+ *
+ * @param mcaddr multicast address
+ * @param addr unicast address
+ * @param port port
+ * @return 0 on success, error code otherwise
+ */
+int varp_open(u32 mcaddr, u32 addr, u16 port){
+    int err = 0;
+    mm_segment_t oldfs;
+
+    //MOD_INC_USE_COUNT;
+    dprintf("> mcaddr=%u.%u.%u.%u addr=%u.%u.%u.%u port=%u\n",
+            NIPQUAD(mcaddr), NIPQUAD(addr), ntohs(port));
+    oldfs = change_fs(KERNEL_DS);
+    err = varp_mcast_open(mcaddr, addr, port, &varp_mcast_sock);
+    if(err >= 0){
+        err = varp_ucast_open(INADDR_ANY, port, &varp_ucast_sock);
+    }
+    // Single restore point for the address-space limit, success or failure.
+    set_fs(oldfs);
+    if(err >= 0){
+        err = varp_start();
+    }
+    if(err){
+        varp_close();
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/string.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/udp.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+
+#include <etherip.h>
+#include <if_varp.h>
+#include <vnet_dev.h>
+#include <vif.h>
+#include "allocate.h"
+#include "hash_table.h"
+#include "sys_net.h"
+#include "sys_string.h"
+
+#define MODULE_NAME "VNET"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** Table of vifs indexed by VifKey. */
+HashTable *vif_table = NULL;
+
+/** Decrement a vif's reference count, freeing it when the count hits zero.
+ *
+ * @param vif vif (OK if null)
+ */
+void vif_decref(Vif *vif){
+    if(!vif) return;
+    if(atomic_dec_and_test(&vif->refcount)){
+        // Vifs come from ALLOCATE() in vif_add(), so release them with the
+        // matching deallocate() rather than a raw kfree().
+        deallocate(vif);
+    }
+}
+
+/** Increment a vif's reference count.
+ *
+ * @param vif vif (OK if null)
+ */
+void vif_incref(Vif *vif){
+    if(vif){
+        atomic_inc(&vif->refcount);
+    }
+}
+
+/** Hash function for keys in the vif table.
+ * Hashes the vnet id and mac: the first four MAC bytes are packed into one
+ * word and combined with the vnet id, then the last two bytes are mixed in.
+ *
+ * @param k key (VifKey)
+ * @return hashcode
+ */
+Hashcode vif_key_hash_fn(void *k){
+    VifKey *key = k;
+    unsigned long mac_hi;
+    unsigned long mac_lo;
+    Hashcode h;
+
+    // Widen each byte before shifting: a byte promoted to (signed) int and
+    // shifted left by 24 overflows when its top bit is set.
+    mac_hi = ((unsigned long)key->vmac.mac[0] << 24) |
+             ((unsigned long)key->vmac.mac[1] << 16) |
+             ((unsigned long)key->vmac.mac[2] <<  8) |
+             ((unsigned long)key->vmac.mac[3]      );
+    mac_lo = ((unsigned long)key->vmac.mac[4] <<  8) |
+             ((unsigned long)key->vmac.mac[5]      );
+    h = hash_2ul(key->vnet, mac_hi);
+    h = hash_hul(h, mac_lo);
+    return h;
+}
+
+
+/** Equality test for keys in the vif table.
+ * Two keys are equal when both the vnet id and the MAC address match.
+ *
+ * @param k1 key to compare (VifKey)
+ * @param k2 key to compare (VifKey)
+ * @return 1 if equal, 0 otherwise
+ */
+int vif_key_equal_fn(void *k1, void *k2){
+    VifKey *a = k1;
+    VifKey *b = k2;
+
+    if(a->vnet != b->vnet) return 0;
+    return memcmp(a->vmac.mac, b->vmac.mac, ETH_ALEN) == 0;
+}
+
+/** Release an entry of the vif table.
+ * Drops the table's reference on the vif stored in the entry (if any)
+ * and frees the entry itself.
+ *
+ * @param table containing table
+ * @param entry entry to free (OK if null)
+ */
+static void vif_entry_free_fn(HashTable *table, HTEntry *entry){
+    Vif *held;
+
+    if(entry){
+        held = entry->value;
+        if(held){
+            vif_decref(held);
+        }
+        HTEntry_free(entry);
+    }
+}
+
+/** Look up a vif by vnet id and MAC address.
+ * On success the vif's reference count is incremented, so the caller
+ * must release it with vif_decref().
+ *
+ * @param vnet vnet id
+ * @param vmac MAC address
+ * @param vif return parameter for the vif (NULL if not found)
+ * @return 0 on success, -ENOENT otherwise
+ */
+int vif_lookup(int vnet, Vmac *vmac, Vif **vif){
+    VifKey key = {};
+    HTEntry *found;
+
+    key.vnet = vnet;
+    key.vmac = *vmac;
+    found = HashTable_get_entry(vif_table, &key);
+    if(!found){
+        *vif = NULL;
+        return -ENOENT;
+    }
+    *vif = found->value;
+    vif_incref(*vif);
+    return 0;
+}
+
+/** Create a new vif and insert it into the vif table.
+ * The vif itself is used as the table key. On success the vif carries two
+ * references: the initial one (kept with the table entry) and one taken
+ * for the caller, who must release it with vif_decref().
+ *
+ * @param vnet vnet id
+ * @param vmac MAC address
+ * @param val return parameter for the vif (NULL on error)
+ * @return 0 on success, negative error code otherwise
+ */
+int vif_add(int vnet, Vmac *vmac, Vif **val){
+    int err = 0;
+    Vif *created;
+
+    dprintf("> vnet=%d\n", vnet);
+    created = ALLOCATE(Vif);
+    if(!created){
+        err = -ENOMEM;
+    } else {
+        atomic_set(&created->refcount, 1);
+        created->vnet = vnet;
+        created->vmac = *vmac;
+        if(HashTable_add(vif_table, created, created)){
+            vif_incref(created);
+        } else {
+            err = -ENOMEM;
+            deallocate(created);
+            created = NULL;
+        }
+    }
+    *val = (err ? NULL : created);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Remove a vif from the vif table.
+ *
+ * @param vnet vnet id
+ * @param vmac MAC address
+ * @return the value returned by HashTable_remove() for the key
+ */
+int vif_remove(int vnet, Vmac *vmac){
+    VifKey key = { .vnet = vnet, .vmac = *vmac };
+
+    return HashTable_remove(vif_table, &key);
+}
+
+/** Find a vif, optionally creating it when it does not exist.
+ *
+ * @param vnet vnet id
+ * @param vmac MAC address
+ * @param create if non-zero, add the vif when the lookup fails
+ * @param vif return parameter for the vif
+ * @return 0 on success, error code otherwise
+ */
+int vif_find(int vnet, Vmac *vmac, int create, Vif **vif){
+    int err = vif_lookup(vnet, vmac, vif);
+
+    if(err == 0 || !create){
+        return err;
+    }
+    return vif_add(vnet, vmac, vif);
+}
+
+/** Remove every entry from the vif table. */
+void vif_purge(void)
+{
+    HashTable_clear(vif_table);
+}
+
+/** Create a vif, failing if one already exists for the vnet and MAC.
+ *
+ * @param vnet vnet id
+ * @param vmac MAC address
+ * @param vif return parameter for the new vif (NULL on error)
+ * @return 0 on success, -EEXIST if the vif exists, other error code otherwise
+ */
+int vif_create(int vnet, Vmac *vmac, Vif **vif){
+    int err = 0;
+
+    dprintf(">\n");
+    if(!vif_lookup(vnet, vmac, vif)){
+        // vif_lookup() took a reference on the existing vif. We do not
+        // hand it back to the caller, so release it before reporting.
+        vif_decref(*vif);
+        err = -EEXIST;
+        goto exit;
+    }
+    dprintf("> vif_add...\n");
+    err = vif_add(vnet, vmac, vif);
+  exit:
+    if(err){
+        *vif = NULL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Create a vif from a vnet id and a MAC address string.
+ * The vif stays in the vif table; no reference is returned to the caller.
+ *
+ * @param vnet vnet id
+ * @param mac mac address (as a string)
+ * @return 0 on success, error code otherwise
+ */
+int mkvif(int vnet, char *mac){
+    int err = 0;
+    Vmac vmac = {};
+    Vif *vif = NULL;
+
+    dprintf("> vnet=%d mac=%s\n", vnet, mac);
+    err = mac_aton(mac, vmac.mac);
+    if(err) goto exit;
+    err = vif_create(vnet, &vmac, &vif);
+    if(err) goto exit;
+    // vif_create() returns a reference for the caller in addition to the
+    // one kept with the table entry; we do not keep the pointer, so drop it.
+    vif_decref(vif);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Set up the vif table.
+ * Allocates the table and installs the vif-specific entry-free, key-hash
+ * and key-equality callbacks.
+ *
+ * @return 0 on success, error code otherwise
+ */
+int vif_init(void){
+    int err = 0;
+
+    dprintf(">\n");
+    vif_table = HashTable_new(0);
+    if(vif_table){
+        vif_table->entry_free_fn = vif_entry_free_fn;
+        vif_table->key_hash_fn = vif_key_hash_fn;
+        vif_table->key_equal_fn = vif_key_equal_fn;
+        // Some vifs for testing.
+        //mkvif(1, "aa:00:00:00:20:11");
+        //mkvif(2, "aa:00:00:00:20:12");
+    } else {
+        err = -ENOMEM;
+    }
+    if(err < 0) wprintf("< err=%d\n", err);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Free the vif table.
+ * The global pointer is cleared afterwards so that it never dangles
+ * (vnet_exit() does the same for the vnet table).
+ */
+void vif_exit(void){
+    HashTable_free(vif_table);
+    vif_table = NULL;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_VIF_H_
+#define _VNET_VIF_H_
+
+#include <if_varp.h>
+struct net_device;
+
+/** Key for entries in the vif table. */
+typedef struct VifKey {
+ int vnet;
+ Vmac vmac;
+} VifKey;
+
+typedef struct Vif { /* Vif record. vnet and vmac come first, as in VifKey: vif_add() passes the Vif itself as the table key. */
+ int vnet; /**< Vnet id. */
+ Vmac vmac; /**< MAC address of the vif. */
+ struct net_device *dev; /**< Device packets for this vif are received on (see the vif-based vnet_skb_recv()). */
+ atomic_t refcount; /**< Reference count; the vif is freed when it reaches zero (vif_decref()). */
+} Vif;
+
+struct HashTable;
+extern struct HashTable *vif_table;
+
+extern void vif_decref(Vif *vif);
+extern void vif_incref(Vif *vif);
+
+extern int vif_create(int vnet, Vmac *vmac, Vif **vif);
+
+extern int vif_add(int vnet, Vmac *vmac, Vif **vif);
+extern int vif_lookup(int vnet, Vmac *vmac, Vif **vif);
+extern int vif_remove(int vnet, Vmac *vmac);
+extern int vif_find(int vnet, Vmac *vmac, int create, Vif **vif);
+extern void vif_purge(void);
+
+extern int vif_init(void);
+extern void vif_exit(void);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/version.h>
+#include <linux/errno.h>
+
+#include <linux/string.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <net/route.h>
+#include <linux/skbuff.h>
+#include <net/checksum.h>
+
+#include <tunnel.h>
+#include <sa.h>
+#include <varp.h>
+#include <if_varp.h>
+#include <esp.h>
+#include <etherip.h>
+#include <random.h>
+#include <tunnel.h>
+
+#include <vnet_dev.h>
+#include <vnet.h>
+#include <vif.h>
+#include <vnet_ioctl.h>
+#include <sa_algorithm.h>
+
+#include "allocate.h"
+#include "hash_table.h"
+#include "sys_net.h"
+#include "sys_string.h"
+
+#define MODULE_NAME "VNET"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+/** Default vnet security level.
+ */
+int vnet_security_default = SA_AUTH ; //| SA_CONF;
+
+/** Key for entries in the vnet address table. */
+typedef struct VnetAddrKey {
+ /** Vnet id. */
+ int vnet;
+ /** MAC address. */
+ unsigned char mac[ETH_ALEN];
+} VnetAddrKey;
+
+/** The physical vnet. */
+Vnet *vnet_physical = NULL;
+
+/** Table of vnets indexed by id. */
+static HashTable *vnet_table = NULL;
+
+/** Drop a reference on a vnet.
+ * When the last reference goes the vnet's device is removed and the
+ * vnet is deallocated.
+ *
+ * @param info vnet (OK if null)
+ */
+void Vnet_decref(Vnet *info){
+    if(info && atomic_dec_and_test(&info->refcount)){
+        dprintf("> free vnet=%u\n", info->vnet);
+        vnet_dev_remove(info);
+        deallocate(info);
+    }
+}
+
+/** Take a reference on a vnet.
+ *
+ * @param info vnet (OK if null)
+ */
+void Vnet_incref(Vnet *info){
+    if(info){
+        atomic_inc(&info->refcount);
+    }
+}
+
+/** Allocate a vnet with its reference count set to 1.
+ *
+ * @param info return parameter for vnet (NULL if allocation failed)
+ * @return 0 on success, -ENOMEM otherwise
+ */
+int Vnet_alloc(Vnet **info){
+    Vnet *fresh = ALLOCATE(Vnet);
+
+    *info = fresh;
+    if(!fresh){
+        return -ENOMEM;
+    }
+    atomic_set(&fresh->refcount, 1);
+    return 0;
+}
+
+/** Insert a vnet into the vnet table, keyed by its vnet id.
+ * The table takes its own reference on the vnet; that reference is
+ * dropped again if the insertion fails.
+ *
+ * @param info vnet to add
+ * @return 0 on success, error code otherwise
+ */
+int Vnet_add(Vnet *info){
+    // Vnet_del(info->vnet); //todo: Delete existing vnet info?
+    Vnet_incref(info);
+    if(HashTable_add(vnet_table, HKEY(info->vnet), info)){
+        return 0;
+    }
+    Vnet_decref(info);
+    return -ENOMEM;
+}
+
+/** Remove the vnet with the given id from the vnet table.
+ *
+ * @param vnet id of vnet to remove
+ * @return number of vnets removed
+ */
+int Vnet_del(vnetid_t vnet)
+{
+    int removed = HashTable_remove(vnet_table, HKEY(vnet));
+
+    return removed;
+}
+
+/** Look up a vnet by id.
+ * A reference is taken on the vnet on success - the caller must decref.
+ *
+ * @param vnet vnet id
+ * @param info return parameter for vnet (NULL if not found)
+ * @return 0 on success, -ENOENT if no vnet found
+ */
+int Vnet_lookup(vnetid_t vnet, Vnet **info){
+    int err = 0;
+    Vnet *found;
+
+    dprintf("> vnet=%u info=%p\n", vnet, info);
+    dprintf("> vnet_table=%p\n",vnet_table);
+    found = HashTable_get(vnet_table, HKEY(vnet));
+    *info = found;
+    if(found){
+        Vnet_incref(found);
+    } else {
+        err = -ENOENT;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Release an entry of the vnet table.
+ * Removes the device of the vnet held by the entry, drops the table's
+ * reference on it and frees the entry.
+ *
+ * @param table containing table
+ * @param entry to free (OK if null)
+ */
+static void vnet_entry_free_fn(HashTable *table, HTEntry *entry){
+    Vnet *held;
+
+    if(entry){
+        held = entry->value;
+        if(held){
+            vnet_dev_remove(held);
+            Vnet_decref(held);
+        }
+        HTEntry_free(entry);
+    }
+}
+
+/** Create some vnet entries (for testing).
+ * Creates 5 vnets with consecutive ids starting at VNET_VIF. A vnet gets
+ * the default security level only if its id is above 10, otherwise it is
+ * insecure (so with the current count all of them are insecure).
+ *
+ * NOTE(review): the reference obtained from Vnet_alloc() is not released
+ * here, on success or on failure - confirm whether Vnet_create() takes
+ * ownership of it.
+ *
+ * @return 0 on success, negative error code otherwise
+ */
+static int vnet_setup(void){
+    int err = 0;
+    int idx = 0;
+    int total = 5; //20;
+    int level = vnet_security_default;
+    Vnet *vnet;
+
+    dprintf(">\n");
+    while(idx < total){
+        err = Vnet_alloc(&vnet);
+        if(err) break;
+        vnet->vnet = VNET_VIF + idx;
+        if(vnet->vnet > 10){
+            vnet->security = level;
+        } else {
+            vnet->security = 0;
+        }
+        //err = Vnet_add(vnet);
+        err = Vnet_create(vnet);
+        if(err) break;
+        idx++;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Set up the vnet table and the physical vnet.
+ * After the table and the physical vnet (id VNET_PHYS, no security) are
+ * in place, the test vnets, varp and the vif table are initialized in
+ * that order; the first failure stops the sequence.
+ *
+ * @return 0 on success, error code otherwise
+ */
+int vnet_init(void){
+    int err = 0;
+
+    dprintf(">\n");
+    vnet_table = HashTable_new(0);
+    dprintf("> vnet_table=%p\n", vnet_table);
+    if(!vnet_table){
+        err = -ENOMEM;
+        goto exit;
+    }
+    vnet_table->entry_free_fn = vnet_entry_free_fn;
+
+    err = Vnet_alloc(&vnet_physical);
+    if(err) goto exit;
+    vnet_physical->vnet = VNET_PHYS;
+    vnet_physical->security = 0;
+    err = Vnet_add(vnet_physical);
+    if(!err) err = vnet_setup();
+    if(!err) err = varp_init();
+    if(!err) err = vif_init();
+  exit:
+    if(err < 0) wprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Tear down what vnet_init() set up.
+ * Shuts down the vif table and varp, frees the vnet table and finally
+ * releases the reference held through the vnet_physical global.
+ */
+void vnet_exit(void){
+    vif_exit();
+    varp_exit();
+    HashTable_free(vnet_table);
+    vnet_table = NULL;
+    // vnet_init() keeps the Vnet_alloc() reference in vnet_physical in
+    // addition to the table's own reference; drop it so the physical vnet
+    // is actually freed. Vnet_decref() accepts NULL if init never got here.
+    Vnet_decref(vnet_physical);
+    vnet_physical = NULL;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)
+
+/** Find an output route for a packet (2.6 kernels).
+ * The route key is built from the packet's IP destination, source and tos
+ * and the index of the packet's device.
+ *
+ * @param skb packet (skb->dev and skb->nh.iph must be set)
+ * @param prt return parameter for the route
+ * @return result of ip_route_output_key()
+ */
+static inline int skb_route(struct sk_buff *skb, struct rtable **prt){
+    struct iphdr *iph = skb->nh.iph;
+    struct flowi fl = {
+        .oif = skb->dev->ifindex,
+        .nl_u = {
+            .ip4_u = {
+                .daddr = iph->daddr,
+                .saddr = iph->saddr,
+                .tos = iph->tos,
+            }
+        }
+    };
+
+    return ip_route_output_key(prt, &fl);
+}
+
+#else
+
+/** Find an output route for a packet (pre-2.6 kernels).
+ * The route key is built from the packet's IP destination, source and tos
+ * and the index of the packet's device.
+ *
+ * @param skb packet (skb->dev and skb->nh.iph must be set)
+ * @param prt return parameter for the route
+ * @return result of ip_route_output_key()
+ */
+static inline int skb_route(struct sk_buff *skb, struct rtable **prt){
+    struct iphdr *iph = skb->nh.iph;
+    struct rt_key key = { };
+
+    key.oif = skb->dev->ifindex;
+    key.tos = iph->tos;
+    key.src = iph->saddr;
+    key.dst = iph->daddr;
+    return ip_route_output_key(prt, &key);
+}
+
+#endif
+
+/** Route an IP packet and send it out.
+ * Marks the skb as IP, attaches the route found for it, selects an IP
+ * identifier, fills in the source address from the route if it is zero,
+ * recomputes the IP header checksum and hands the skb to
+ * neigh_compat_output().
+ *
+ * @param skb packet (skb->dev and skb->nh must be set)
+ * @return 0 on success, error code otherwise
+ */
+inline int skb_xmit(struct sk_buff *skb){
+    int err;
+    struct rtable *route = NULL;
+    struct iphdr *iph;
+
+    dprintf("> skb=%p dev=%s\n", skb, skb->dev->name);
+
+    skb->protocol = htons(ETH_P_IP);
+    err = skb_route(skb, &route);
+    if(!err){
+        skb->dst = &route->u.dst;
+        iph = skb->nh.iph;
+
+        ip_select_ident(iph, &route->u.dst, NULL);
+        if(iph->saddr == 0){
+            iph->saddr = route->rt_src;
+        }
+        // The checksum field must be zero while the checksum is computed.
+        iph->check = 0;
+        iph->check = ip_compute_csum(skb->nh.raw, (iph->ihl << 2));
+
+        err = neigh_compat_output(skb);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Called when a vif sends a packet to the network.
+ * Packets for the physical vnet (VNET_PHYS, or vnet 0) are transmitted
+ * directly; packets for any other vnet are passed to varp_output() to be
+ * encapsulated for the vnet and forwarded.
+ *
+ * @param skb packet
+ * @param vnet vnet id
+ * @return 0 on success, error code otherwise
+ *
+ * @todo fixme
+ */
+int vnet_skb_send(struct sk_buff *skb, u32 vnet){
+    int err = 0;
+
+    dprintf("> skb=%p vnet=%u\n", skb, vnet);
+    if(vnet == VNET_PHYS || !vnet){
+        // For completeness, send direct to the network.
+        if(skb->dev){
+            err = skb_xmit(skb);
+        } else {
+            // Can't assume eth0 - might be nbe-br or other. Need to route.
+            struct net_device *dev = NULL;
+            err = vnet_get_device(DEVICE, &dev);
+            if(err) goto exit;
+            skb->dev = dev;
+            err = skb_xmit(skb);
+            // Release the device reference obtained from vnet_get_device().
+            dev_put(dev);
+        }
+    } else {
+        dprintf("> varp_output\n");
+        err = varp_output(skb, vnet);
+    }
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Receive an skb for a vnet.
+ * If the dest is broadcast, goes to all vifs on the vnet.
+ * If the dest is unicast, goes to addressed vif on vnet.
+ * For each vif we set the packet dev and receive the packet.
+ *
+ * The packet must have skb->mac.raw set and skb->data must point
+ * after the device (ethernet) header.
+ *
+ * @param skb packet
+ * @param vnet packet vnet
+ * @param vmac packet vmac
+ * @return 0 on success, error code otherwise
+ */
+#if 1
+int vnet_skb_recv(struct sk_buff *skb, u32 vnet, Vmac *vmac){
+    // Receive the skb for a vnet.
+    // We make the skb come out of the vif for the vnet, and
+    // let ethernet bridging forward it to related interfaces.
+    // The skb is consumed in all cases: handed to netif_rx() on success,
+    // freed on error.
+    int err = 0;
+    Vnet *info = NULL;
+
+    dprintf("> vnet=%u mac=%s\n", vnet, mac_ntoa(vmac->mac));
+    err = Vnet_lookup(vnet, &info);
+    if(err) goto exit;
+    if(!info->dev){
+        // A vnet can be registered without a device (vnet_init() adds the
+        // physical vnet without setting one); don't feed netif_rx() a
+        // packet with a null device.
+        err = -ENODEV;
+        goto exit;
+    }
+    skb->dev = info->dev;
+    dprintf("> netif_rx dev=%s\n", skb->dev->name);
+    netif_rx(skb);
+  exit:
+    // Drop the reference taken by Vnet_lookup() (no-op if lookup failed).
+    if(info) Vnet_decref(info);
+    if(err){
+        kfree_skb(skb);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+#else
+int vnet_skb_recv(struct sk_buff *skb, u32 vnet, Vmac *vmac){
+    // Alternative (currently compiled out) receive path that delivers
+    // directly to vifs: a copy to every vif on the vnet for multicast
+    // destinations, or the original skb to the addressed vif for unicast.
+    int err = 0;
+    Vif *vif = NULL;
+
+    dprintf("> vnet=%u mac=%s\n", vnet, mac_ntoa(vmac->mac));
+    if(mac_is_multicast(vmac->mac)){
+        HashTable_for_decl(entry);
+        int count = 0;
+        struct sk_buff *new_skb;
+
+        HashTable_for_each(entry, vif_table){
+            vif = entry->value;
+            if(vif->vnet != vnet) continue;
+            count++;
+            new_skb = skb_copy(skb, GFP_ATOMIC);
+            if(!new_skb) break;
+            new_skb->dev = vif->dev;
+            dprintf("> %d] netif_rx dev=%s\n", count, new_skb->dev->name);
+            netif_rx(new_skb);
+        }
+        // Every recipient got its own copy; the original is done with.
+        kfree_skb(skb);
+    } else {
+        err = vif_lookup(vnet, vmac, &vif);
+        if(err){
+            kfree_skb(skb);
+            goto exit;
+        }
+        skb->dev = vif->dev;
+        dprintf("> netif_rx dev=%s\n", skb->dev->name);
+        netif_rx(skb);
+        // vif_lookup() took a reference on the vif; release it now that
+        // the skb has been handed off.
+        vif_decref(vif);
+    }
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+#endif
+
+/** Check validity of an incoming IP frame.
+ *
+ * Verifies that the frame is long enough for an IP header, that the header
+ * is IPv4 with a plausible header length, and that the total length is
+ * consistent with the skb. On success skb->h.raw is set to point just past
+ * the IP header. The IP header checksum is NOT verified here: the
+ * recomputation was disabled in the original code, which left only a
+ * comparison of the checksum with itself.
+ *
+ * @param skb frame (skb->data at the ethernet header, skb->nh set)
+ * @return 0 if ok, -EINVAL otherwise
+ *
+ * @todo fixme Can prob skip most of this because linux will have done it.
+ * @todo Only need the vnet skb context check.
+ */
+int check_ip_frame(struct sk_buff *skb){
+    int err = -EINVAL;
+    struct iphdr* iph;
+    __u32 len;
+
+#if 0
+    if(skb->context){
+        // Todo: After ESP want to skip most checks (including checksum),
+        // Todo: but in general may not want to skip all checks on detunnel.
+        //dprintf("> Skip check, has context\n");
+        err = 0;
+        goto exit;
+    }
+#endif
+    // Check we have enough for an ip header - the skb passed should
+    // have data pointing at the eth header and skb->len should include
+    // that. skb->nh should already have been set. Let the individual
+    // protocol handlers worry about the exact ip header len
+    // (i.e. whether any ip options are set).
+    if(skb->len < ETH_HLEN + sizeof(struct iphdr)){
+        wprintf("> packet too short for ip header\n");
+        goto exit;
+    }
+
+    iph = skb->nh.iph;
+    /*
+     * Is the datagram acceptable?
+     *
+     * 1. Length at least the size of an ip header
+     * 2. Version of 4
+     * 3. Doesn't have a bogus length
+     *
+     * (RFC1122 3.1.2.2 also requires discarding frames that fail the
+     * header checksum; that check is not performed here, see above.)
+     */
+    if (iph->ihl < 5 || iph->version != 4){
+        wprintf("> len and version check failed\n");
+        goto exit;
+    }
+    if(skb->len < ETH_HLEN + (iph->ihl << 2)){
+        wprintf("> packet too short for given ihl\n");
+        goto exit;
+    }
+
+    len = ntohs(iph->tot_len);
+    if (skb->len < len + ETH_HLEN || len < (iph->ihl << 2)){
+        wprintf("> packet too short for tot_len\n");
+        goto exit;
+    }
+    // Transport header starts right after the (variable length) IP header.
+    skb->h.raw = skb->nh.raw + (iph->ihl << 2);
+    err = 0;
+  exit:
+    return err;
+}
+
+/** Determine ESP security mode for a new SA.
+ * At present every SA gets the default security level; the arguments
+ * are not consulted.
+ *
+ * @param spi incoming spi
+ * @param protocol incoming protocol
+ * @param addr source address
+ * @return security level or negative error code
+ *
+ * @todo Need to check spi, and do some lookup for security params.
+ */
+int vnet_sa_security(u32 spi, int protocol, u32 addr)
+{
+    int level = vnet_security_default;
+
+    dprintf("< security=%x\n", level);
+    return level;
+}
+
+/** Create a new SA for incoming traffic.
+ * The security level for the SA is obtained from vnet_sa_security().
+ *
+ * @param spi incoming spi
+ * @param protocol incoming protocol
+ * @param addr source address
+ * @param sa return parameter for SA
+ * @return 0 on success, error code otherwise
+ */
+int vnet_sa_create(u32 spi, int protocol, u32 addr, SAState **sa){
+    int level = vnet_sa_security(spi, protocol, addr);
+
+    if(level < 0){
+        // A negative level is an error code: pass it straight back.
+        return level;
+    }
+    return sa_create(level, spi, protocol, addr, sa);
+}
+
+/** Check that a context has the correct properties w.r.t. a vnet.
+ * The context must be secure if the vnet requires security: it must be
+ * present and its protocol must be ESP.
+ *
+ * The vnet found by the lookup is returned through val with a reference
+ * held, even when the context check itself fails.
+ *
+ * @param vnet vnet id
+ * @param context context
+ * @param val return parameter for the vnet (NULL if the vnet is unknown)
+ * @return 0 on success, error code otherwise
+ *
+ * @todo Need to check that the sa provides the correct security level.
+ */
+int vnet_check_context(int vnet, SkbContext *context, Vnet **val){
+    int err;
+    Vnet *info = NULL;
+    SAState *sa = NULL;
+
+    err = Vnet_lookup(vnet, &info);
+    if(err){
+        wprintf("> No vnet %d\n", vnet);
+    } else if(info->security){
+        if(!context){
+            wprintf("> No security context\n");
+            err = -EINVAL;
+        } else if(context->protocol != IPPROTO_ESP){
+            wprintf("> Invalid protocol: wanted %d, got %d\n", IPPROTO_ESP, context->protocol);
+            err = -EINVAL;
+        } else {
+            sa = context->data;
+            //todo: Check security properties of the SA are correct w.r.t. the vnet.
+            //Something like sa->security == info->security;
+        }
+    }
+    *val = info;
+    return err;
+}
+
+/** Open function for SA tunnels.
+ * Nothing needs doing when an SA tunnel is opened.
+ *
+ * @param tunnel to open
+ * @return 0 always
+ */
+static int sa_tunnel_open(Tunnel *tunnel)
+{
+    return 0;
+}
+
+/** Close function for SA tunnels.
+ * Releases the tunnel's reference on its SA, if it has one.
+ *
+ * @param tunnel to close (OK if null)
+ */
+static void sa_tunnel_close(Tunnel *tunnel){
+    SAState *state;
+
+    dprintf(">\n");
+    if(tunnel){
+        state = tunnel->data;
+        if(state){
+            SAState_decref(state);
+            tunnel->data = NULL;
+            dprintf("<\n");
+        }
+    }
+}
+
+/** Packet send function for SA tunnels.
+ * Sends the packet through the tunnel's SA on top of the tunnel's base.
+ *
+ * @param tunnel to send on
+ * @param skb packet to send
+ * @return 0 on success, negative error code on error (-EINVAL if the
+ *         tunnel or its SA is missing)
+ */
+static int sa_tunnel_send(Tunnel *tunnel, struct sk_buff *skb){
+    SAState *state;
+
+    if(!tunnel){
+        wprintf("> Null tunnel!\n");
+        return -EINVAL;
+    }
+    state = tunnel->data;
+    if(!state){
+        wprintf("> Null SA!\n");
+        return -EINVAL;
+    }
+    return SAState_send(state, skb, tunnel->base);
+}
+
+/** Operations table for SA tunnels: name plus open, close and send handlers. */
+static TunnelType _sa_tunnel_type = {
+ .name = "SA",
+ .open = sa_tunnel_open,
+ .close = sa_tunnel_close,
+ .send = sa_tunnel_send
+};
+
+/** Non-static pointer to the SA tunnel operations table above. */
+TunnelType *sa_tunnel_type = &_sa_tunnel_type;
+
+/** Open a tunnel for a vnet to a given address.
+ *
+ * An etherip tunnel is created and added to the tunnel table. If the vnet
+ * requires security, an SA tunnel carrying a freshly created ESP SA is
+ * created first and used as the base of the etherip tunnel.
+ *
+ * @param vnet vnet id
+ * @param addr destination address
+ * @param tunnel return parameter (NULL on error)
+ * @return 0 on success, error code otherwise
+ */
+int vnet_tunnel_open(u32 vnet, u32 addr, Tunnel **tunnel){
+    extern TunnelType *etherip_tunnel_type;
+    int err = 0;
+    Vnet *info = NULL;
+    Tunnel *base_tunnel = NULL;
+    Tunnel *sa_tunnel = NULL;
+    Tunnel *etherip_tunnel = NULL;
+
+    dprintf("> vnet=%u addr=" IPFMT "\n", vnet, NIPQUAD(addr));
+    err = Vnet_lookup(vnet, &info);
+    dprintf("> Vnet_lookup=%d\n", err);
+    if(err) goto exit;
+    if(info->security){
+        SAState *sa = NULL;
+        dprintf("> security=%d\n", info->security);
+        err = Tunnel_create(sa_tunnel_type, vnet, addr, base_tunnel, &sa_tunnel);
+        if(err) goto exit;
+        dprintf("> sa_tunnel=%p\n", sa_tunnel);
+        err = sa_create(info->security, 0, IPPROTO_ESP, addr, &sa);
+        if(err) goto exit;
+        sa_tunnel->data = sa;
+        dprintf("> sa=%p\n", sa);
+        base_tunnel = sa_tunnel;
+    }
+    err = Tunnel_create(etherip_tunnel_type, vnet, addr, base_tunnel, &etherip_tunnel);
+    if(err) goto exit;
+    err = Tunnel_add(etherip_tunnel);
+    // NOTE(review): if Tunnel_add() fails the etherip tunnel is not
+    // released here - confirm whether that leaks the creation reference.
+  exit:
+    // Drop our local references (both calls are made with NULL when the
+    // corresponding object was never obtained).
+    Tunnel_decref(sa_tunnel);
+    Vnet_decref(info);
+    if(err){
+        *tunnel = NULL;
+    } else {
+        *tunnel = etherip_tunnel;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Get a tunnel for a vnet to a given address.
+ * An existing tunnel is used if there is one, otherwise a new tunnel
+ * is opened.
+ *
+ * @param vnet vnet id
+ * @param addr care-of address
+ * @param tunnel return parameter
+ * @return 0 on success, error code otherwise
+ */
+int vnet_tunnel_lookup(u32 vnet, u32 addr, Tunnel **tunnel){
+    int err = 0;
+    Tunnel *existing;
+
+    dprintf("> vnet=%d addr=" IPFMT "\n", vnet, NIPQUAD(addr));
+    existing = Tunnel_lookup(vnet, addr);
+    *tunnel = existing;
+    if(existing == NULL){
+        err = vnet_tunnel_open(vnet, addr, tunnel);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Send a packet on the tunnel for a vnet and endpoint.
+ * The tunnel is looked up (and opened if necessary), used to send the
+ * packet, and released again.
+ *
+ * @param vnet vnet
+ * @param addr tunnel endpoint
+ * @param skb packet
+ * @return 0 on success, error code otherwise
+ */
+int vnet_tunnel_send(vnetid_t vnet, vnetaddr_t addr, struct sk_buff *skb){
+    int err;
+    Tunnel *tun = NULL;
+
+    dprintf("> vnet=%u addr=" IPFMT "\n", vnet, NIPQUAD(addr));
+    err = vnet_tunnel_lookup(vnet, addr, &tun);
+    if(!err){
+        err = Tunnel_send(tun, skb);
+        Tunnel_decref(tun);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Shut down the vnet module.
+ * Subsystems are torn down in the reverse of the order in which
+ * vnet_module_init() brings them up.
+ *
+ * NOTE(review): this function is marked __exit but is also called from
+ * the __init path on failure - confirm that is acceptable for built-in
+ * (non-modular) builds.
+ */
+static void __exit vnet_module_exit(void){
+    ProcFS_exit();
+    sa_table_exit();
+    vnet_exit();
+    esp_module_exit();
+    etherip_module_exit();
+    // Tear the tunnel subsystem down; the original re-ran its init here.
+    tunnel_module_exit();
+    random_module_exit();
+}
+
+/** Initialize the vnet module.
+ * Failure is fatal: the first subsystem that fails to initialize stops the
+ * sequence, and on a negative error everything is torn down again with
+ * vnet_module_exit().
+ *
+ * @return 0 on success, error code otherwise
+ */
+static int __init vnet_module_init(void){
+    int err = 0;
+
+    dprintf(">\n");
+    err = random_module_init();
+    if(err){
+        wprintf("> random_module_init err=%d\n", err);
+        goto exit;
+    }
+    err = tunnel_module_init();
+    if(err){
+        wprintf("> tunnel_module_init err=%d\n", err);
+        goto exit;
+    }
+    err = etherip_module_init();
+    if(err){
+        wprintf("> etherip_module_init err=%d\n", err);
+        goto exit;
+    }
+    err = esp_module_init();
+    if(err){
+        wprintf("> esp_module_init err=%d\n", err);
+        goto exit;
+    }
+    err = vnet_init();
+    if(err){
+        wprintf("> vnet_init err=%d\n", err);
+        goto exit;
+    }
+    sa_algorithm_probe_all();
+    err = sa_table_init();
+    if(err){
+        wprintf("> sa_table_init err=%d\n", err);
+        // Don't publish the proc entries for a module that failed to start.
+        goto exit;
+    }
+    ProcFS_init();
+  exit:
+    if(err < 0){
+        vnet_module_exit();
+        wprintf("< err=%d\n", err);
+    }
+    return err;
+}
+
+module_init(vnet_module_init);
+module_exit(vnet_module_exit);
+MODULE_LICENSE("GPL");
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef __VNET_VNET_H__
+#define __VNET_VNET_H__
+
+#include <asm/atomic.h>
+#include <linux/skbuff.h>
+
+#include <tunnel.h>
+#include <skb_context.h>
+
+struct Vmac;
+struct Vif;
+struct net_device;
+
+typedef uint32_t vnetid_t;
+typedef uint32_t vnetaddr_t;
+
+/** Vnet property record. */
+typedef struct Vnet {
+ /** Reference count. */
+ atomic_t refcount;
+ /** Vnet id. */
+ vnetid_t vnet;
+ /** Security flag. If true the vnet requires ESP. */
+ int security;
+
+ /** Net device for the vnet. Set by vnet_dev_add(), cleared by vnet_dev_remove(). */
+ struct net_device *dev;
+ /** Bridge device for the vnet, or NULL when no bridge has been created. */
+ struct net_device *bridge;
+
+ /** Max size of the header. */
+ int header_n;
+ /** Statistics. */
+ struct net_device_stats stats;
+ /** Transmit nesting counter, used by the device transmit hook to detect recursion. */
+ int recursion;
+} Vnet;
+
+/* Vnet table and reference counting.
+ * Vnet_lookup() returns 0 when a vnet with the given id is found.
+ */
+extern int Vnet_lookup(vnetid_t id, Vnet **vnet);
+extern int Vnet_add(Vnet *vnet);
+extern int Vnet_del(vnetid_t vnet);
+extern void Vnet_incref(Vnet *);
+extern void Vnet_decref(Vnet *);
+extern int Vnet_alloc(Vnet **vnet);
+extern Vnet *vnet_physical;
+
+/* Packet transmit and receive entry points. */
+extern int skb_xmit(struct sk_buff *skb);
+extern int vnet_skb_send(struct sk_buff *skb, u32 vnet);
+extern int vnet_skb_recv(struct sk_buff *skb, u32 vnet, struct Vmac *vmac);
+
+extern int vnet_check_context(int vnet, SkbContext *context, Vnet **vinfo);
+
+/* Tunnels, keyed by vnet id and address. */
+extern int vnet_tunnel_open(vnetid_t vnet, vnetaddr_t addr, Tunnel **tunnel);
+extern int vnet_tunnel_lookup(vnetid_t vnet, vnetaddr_t addr, Tunnel **tunnel);
+extern int vnet_tunnel_send(vnetid_t vnet, vnetaddr_t addr, struct sk_buff *skb);
+
+/* Called from the module init function; returns 0 on success. */
+extern int vnet_init(void);
+
+/* NOTE(review): presumably return codes for packet handlers - confirm against users. */
+enum {
+ HANDLE_OK = 1,
+ HANDLE_NO = 0,
+};
+
+extern int vnet_sa_security(u32 spi, int protocol, u32 addr);
+struct SAState;
+extern int vnet_sa_create(u32 spi, int protocol, u32 addr, struct SAState **sa);
+
+/* Reserved vnet ids. The control interface rejects ids below VNET_VIF
+ * when creating a vnet.
+ */
+enum {
+ VNET_PHYS = 1,
+ VNET_VIF = 2,
+};
+
+#endif /* !__VNET_VNET_H__ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+
+#include <net/ip.h>
+#include <net/protocol.h>
+
+#include <linux/if_arp.h>
+#include <linux/in6.h>
+#include <linux/inetdevice.h>
+#include <linux/arcdevice.h>
+#include <linux/if_bridge.h>
+
+#include <etherip.h>
+#include <vnet.h>
+#include <varp.h>
+#include <vif.h>
+#include <vnet_dev.h>
+
+#define MODULE_NAME "VNET"
+// DEBUG is defined and immediately undefined again, so debug.h is included
+// with DEBUG unset (presumably this disables dprintf output - confirm in debug.h).
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+// Name formats for the per-vnet interface and bridge; %u is the vnet id.
+#define VNETIF_FMT "vnetif%u"
+#define VNETBR_FMT "vnet%u"
+
+#ifndef CONFIG_BRIDGE
+#error Must configure ethernet bridging in Network Options
+#endif
+
+// Pulls in the bridge's private header by relative path from the include
+// directory, to get at struct net_bridge.
+#include <linux/../../net/bridge/br_private.h>
+// The bridge state is the private data of the bridge's net device.
+#define dev_bridge(_dev) ((struct net_bridge *)(_dev)->priv)
+
+/** Destructor hook for a vnet device.
+ * Clears every callback installed on the device, its flags and its
+ * private data pointer. The device structure itself is not freed here.
+ *
+ * @param dev device being destroyed
+ */
+static void vnet_dev_destructor(struct net_device *dev){
+    dprintf(">\n");
+
+    // Lifecycle hooks.
+    dev->open       = NULL;
+    dev->stop       = NULL;
+    dev->uninit     = NULL;
+    dev->destructor = NULL;
+
+    // Data path and management hooks.
+    dev->hard_start_xmit    = NULL;
+    dev->get_stats          = NULL;
+    dev->do_ioctl           = NULL;
+    dev->change_mtu         = NULL;
+    dev->tx_timeout         = NULL;
+    dev->set_multicast_list = NULL;
+
+    // Remaining state.
+    dev->flags = 0;
+    dev->priv  = NULL;
+}
+
+/** Uninit hook for a vnet device.
+ * Releases nothing; it only traces entry and exit.
+ *
+ * @param dev device (unused)
+ */
+static void vnet_dev_uninit(struct net_device *dev){
+    dprintf(">\n");
+    dprintf("<\n");
+}
+
+/** Statistics hook: return the counters kept in the vnet the device belongs to.
+ *
+ * @param dev vnet device; its priv field points at the Vnet
+ * @return pointer to the vnet's statistics
+ */
+static struct net_device_stats *vnet_dev_get_stats(struct net_device *dev){
+    return &((Vnet *)dev->priv)->stats;
+}
+
+/** Ioctl hook for a vnet device. No commands are implemented.
+ *
+ * @return always 0
+ */
+static int vnet_dev_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd){
+    dprintf(">\n");
+    return 0;
+}
+
+/** MTU change hook for a vnet device.
+ * The new mtu must be at least 68 and must leave room for the vnet
+ * header within 1500 bytes.
+ *
+ * @param dev vnet device
+ * @param mtu requested mtu
+ * @return 0 on success, -EINVAL if the mtu is out of range
+ */
+static int vnet_dev_change_mtu(struct net_device *dev, int mtu){
+    Vnet *info = dev->priv;
+    int mtu_max = 1500 - info->header_n;
+
+    if(mtu < 68 || mtu > mtu_max) return -EINVAL;
+    dev->mtu = mtu;
+    return 0;
+}
+
+/** Set the interface name of a vnet device from its vnet id,
+ * using VNETIF_FMT.
+ *
+ * @param dev vnet device; its priv field must point at the Vnet
+ * @return 0 on success, -EEXIST if a device with that name already exists
+ */
+static int vnet_dev_set_name(struct net_device *dev){
+    int err = 0;
+    Vnet *vnet = dev->priv;
+
+    dprintf(">\n");
+    dprintf("> vnet=%u\n", vnet->vnet);
+    // snprintf always terminates within the given size, so the whole
+    // name buffer can be used.
+    snprintf(dev->name, IFNAMSIZ, VNETIF_FMT, vnet->vnet);
+    if(__dev_get_by_name(dev->name)){
+        // A name collision is not an allocation failure.
+        err = -EEXIST;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+//============================================================================
+#ifdef CONFIG_VNET_BRIDGE
+
+#define BRIDGE DEVICE
+
+/** Delete the bridge of a vnet, if it has one.
+ * Clears the bridge field of the vnet.
+ * Safe to call if the vnet or its bridge are null.
+ *
+ * @param vnet vnet
+ */
+void vnet_bridge_fini(Vnet *vnet){
+    if(!vnet) return;
+    if(vnet->bridge){
+        // Hold the rtnl lock around the delete, as the other
+        // br_del_bridge() call sites in this file do.
+        rtnl_lock();
+        br_del_bridge(vnet->bridge->name);
+        rtnl_unlock();
+        vnet->bridge = NULL;
+    }
+}
+
+/** Create the bridge for a vnet, and add the
+ * vnet interface to it.
+ * The bridge is named from the vnet id using VNETBR_FMT. Spanning tree
+ * and the hello/forward delays are switched off on the new bridge, and
+ * both the vnet interface and the bridge are opened.
+ * On failure the bridge is deleted again and vnet->bridge is left NULL.
+ *
+ * @param vnet vnet
+ * @return 0 on success, error code otherwise
+ */
+int vnet_bridge_init(Vnet *vnet){
+ int err = 0;
+ char bridge[IFNAMSIZ] = {};
+ struct net_bridge *br;
+ vnet->bridge = NULL;
+ snprintf(bridge, IFNAMSIZ - 1, VNETBR_FMT, vnet->vnet);
+ rtnl_lock();
+ err = br_add_bridge(bridge);
+ rtnl_unlock();
+ if(err){
+ dprintf("> Error creating vnet bridge %s: err=%d\n", bridge, err);
+ goto exit;
+ }
+ // NOTE(review): the lookup happens after the rtnl lock was dropped - confirm this is safe.
+ vnet->bridge = __dev_get_by_name(bridge);
+ if(!vnet->bridge){
+ wprintf("> Vnet bridge %s is null!\n", bridge);
+ err = -EINVAL;
+ goto exit;
+ }
+ br = dev_bridge(vnet->bridge);
+ // Disable spanning tree and zero the hello and forward delays.
+ br->stp_enabled = 0;
+ br->bridge_hello_time = 0;
+ br->hello_time = 0;
+ br->bridge_forward_delay = 0;
+ br->forward_delay = 0;
+ rtnl_lock();
+ err = br_add_if(br, vnet->dev);
+ rtnl_unlock();
+ if(err){
+ dprintf("> Error adding vif %s to vnet bridge %s: err=%d\n",
+ vnet->dev->name, bridge, err);
+ goto exit;
+ }
+ rtnl_lock();
+ // The results of dev_open() are not checked.
+ dev_open(vnet->dev);
+ dev_open(vnet->bridge);
+ rtnl_unlock();
+ exit:
+ if(err){
+ // Undo the bridge creation if it got as far as being looked up.
+ if(vnet->bridge){
+ rtnl_lock();
+ br_del_bridge(bridge);
+ rtnl_unlock();
+ vnet->bridge = NULL;
+ }
+ }
+ return err;
+}
+
+
+/** Add an interface to the bridge for a vnet.
+ * The interface is first removed from the default bridge (BRIDGE),
+ * if that device can be found, then opened and added to the vnet bridge.
+ *
+ * @param vnet vnet
+ * @param dev interface
+ * @return 0 on success, -EINVAL if the vnet has no bridge,
+ *         other error code otherwise
+ */
+int vnet_add_if(Vnet *vnet, struct net_device *dev){
+    int err = 0;
+    struct net_device *brdev = NULL;
+
+    dprintf(">\n");
+    if(!vnet->bridge){
+        err = -EINVAL;
+        goto exit;
+    }
+    // Delete the interface from the default bridge.
+    // todo: Really want to delete it from any bridge it's in.
+    if(!vnet_get_device(BRIDGE, &brdev)){
+        rtnl_lock();
+        br_del_if(dev_bridge(brdev), dev);
+        rtnl_unlock();
+        // Release the device reference, as the other successful
+        // vnet_get_device() callers do.
+        dev_put(brdev);
+    }
+    dprintf("> br_add_if %s %s\n", vnet->bridge->name, dev->name);
+    rtnl_lock();
+    dev_open(dev);
+    dev_open(vnet->bridge);
+    err = br_add_if(dev_bridge(vnet->bridge), dev);
+    rtnl_unlock();
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Remove an interface from the bridge for a vnet.
+ *
+ * @param vnet vnet
+ * @param dev interface
+ * @return 0 on success, -EINVAL if the vnet has no bridge
+ */
+int vnet_del_if(Vnet *vnet, struct net_device *dev){
+    int err;
+
+    dprintf(">\n");
+    if(vnet->bridge){
+        rtnl_lock();
+        br_del_if(dev_bridge(vnet->bridge), dev);
+        rtnl_unlock();
+        err = 0;
+    } else {
+        err = -EINVAL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+
+/** Create the bridge and virtual interface for a vnet.
+ * Adds the net device, creates the bridge and then enters the vnet
+ * in the vnet table. If any step fails the bridge is removed again.
+ *
+ * @param info vnet
+ * @return 0 on success, error code otherwise
+ */
+int Vnet_create(Vnet *info){
+    int err;
+
+    dprintf("> %u\n", info->vnet);
+    err = vnet_dev_add(info);
+    if(!err){
+        dprintf("> vnet_bridge_init\n");
+        err = vnet_bridge_init(info);
+    }
+    if(!err){
+        dprintf("> Vnet_add...\n");
+        err = Vnet_add(info);
+    }
+    if(err){
+        dprintf("> vnet_bridge_fini...\n");
+        vnet_bridge_fini(info);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+
+
+/** Remove the bridge and the net device for a vnet.
+ * Clears the bridge and dev fields of the vnet.
+ * Safe to call if the vnet, its bridge or its dev are null.
+ *
+ * @param vnet vnet
+ */
+void vnet_dev_remove(Vnet *vnet){
+    struct net_device *bridge;
+    struct net_device *dev;
+
+    if(!vnet) return;
+    dprintf("> vnet=%u\n", vnet->vnet);
+
+    bridge = vnet->bridge;
+    if(bridge){
+        dprintf("> br_del_bridge(%s)\n", bridge->name);
+        rtnl_lock();
+        br_del_bridge(bridge->name);
+        rtnl_unlock();
+        vnet->bridge = NULL;
+    }
+
+    dev = vnet->dev;
+    if(dev){
+        dprintf("> unregister_netdev(%s)\n", dev->name);
+        unregister_netdev(dev);
+        vnet->dev = NULL;
+    }
+    dprintf("<\n");
+}
+
+//============================================================================
+#else
+//============================================================================
+
+/** Create the virtual interface for a vnet and enter the vnet
+ * in the vnet table.
+ *
+ * @param info vnet
+ * @return 0 on success, error code otherwise
+ */
+int Vnet_create(Vnet *info){
+    int err;
+
+    dprintf("> %u\n", info->vnet);
+    err = vnet_dev_add(info);
+    if(!err){
+        dprintf("> Vnet_add...\n");
+        err = Vnet_add(info);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Stub used when vnet bridging is not configured.
+ *
+ * @return always -ENOSYS
+ */
+int vnet_add_if(Vnet *vnet, struct net_device *dev){
+    return -ENOSYS;
+}
+
+
+/** Stub used when vnet bridging is not configured.
+ *
+ * @return always 0
+ */
+int vnet_del_if(Vnet *vnet, struct net_device *dev){
+    return 0;
+}
+
+/** Remove the net device for a vnet.
+ * Clears the dev field of the vnet.
+ * Safe to call if the vnet or its dev are null.
+ *
+ * @param vnet vnet
+ */
+void vnet_dev_remove(Vnet *vnet){
+    struct net_device *dev;
+
+    if(!vnet) return;
+    dprintf("> vnet=%u\n", vnet->vnet);
+    dev = vnet->dev;
+    if(dev){
+        dprintf("> unregister_netdev(%s)\n", dev->name);
+        unregister_netdev(dev);
+        vnet->dev = NULL;
+    }
+    dprintf("<\n");
+}
+#endif
+//============================================================================
+
+/** Open hook: start the transmit queue of the device.
+ *
+ * @param dev vnet device
+ * @return always 0
+ */
+static int vnet_dev_open(struct net_device *dev){
+    dprintf(">\n");
+    netif_start_queue(dev);
+    dprintf("<\n");
+    return 0;
+}
+
+/** Stop hook: stop the transmit queue of the device.
+ *
+ * @param dev vnet device
+ * @return always 0
+ */
+static int vnet_dev_stop(struct net_device *dev){
+    dprintf(">\n");
+    netif_stop_queue(dev);
+    dprintf("<\n");
+    return 0;
+}
+
+/** Transmit hook for a vnet device.
+ * Sends the packet on the device's vnet via vnet_skb_send() and updates
+ * the vnet statistics. A nested call (the recursion counter is already
+ * non-zero) drops the packet and counts it as a collision and a tx error.
+ *
+ * @param skb packet to send
+ * @param dev vnet device; its priv field points at the Vnet
+ * @return always 0
+ */
+static int vnet_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev){
+    int err = 0;
+    Vnet *vnet = dev->priv;
+    unsigned int len;
+
+    dprintf("> skb=%p\n", skb);
+    // Reject a null skb before anything touches it; the recursion path
+    // below would otherwise pass it to dev_kfree_skb().
+    if(!skb){
+        wprintf("> skb NULL!\n");
+        dprintf("<\n");
+        return 0;
+    }
+    if(vnet->recursion++) {
+        vnet->stats.collisions++;
+        vnet->stats.tx_errors++;
+        wprintf("> recursion!\n");
+        dev_kfree_skb(skb);
+        goto exit;
+    }
+    dprintf("> skb->data=%p skb->mac.raw=%p\n", skb->data, skb->mac.raw);
+    // Repair a mac header pointer that lies outside [data, network header].
+    if(skb->mac.raw < skb->data || skb->mac.raw > skb->nh.raw){
+        wprintf("> skb mac duff!\n");
+        skb->mac.raw = skb->data;
+    }
+    // Record the length first: the skb must not be touched once it has
+    // been handed to vnet_skb_send(), which may pass it on or free it.
+    len = skb->len;
+    err = vnet_skb_send(skb, vnet->vnet);
+    if(err < 0){
+        vnet->stats.tx_errors++;
+    } else {
+        vnet->stats.tx_packets++;
+        vnet->stats.tx_bytes += len;
+    }
+  exit:
+    vnet->recursion--;
+    dprintf("<\n");
+    return 0;
+}
+
+/** Transmit timeout hook. Takes no action beyond tracing the call.
+ *
+ * @param dev vnet device (unused)
+ */
+void vnet_dev_tx_timeout(struct net_device *dev){
+    dprintf(">\n");
+}
+
+/** Multicast list hook. Takes no action beyond tracing the call.
+ *
+ * @param dev vnet device (unused)
+ */
+void vnet_dev_set_multicast_list(struct net_device *dev){
+    dprintf(">\n");
+}
+
+/** The ethernet hard_header function that ether_setup() installed,
+ * captured by vnet_dev_init() before it is replaced by the wrapper below.
+ */
+static int (*eth_hard_header)(struct sk_buff *skb,
+                              struct net_device *dev, unsigned short type,
+                              void *daddr, void *saddr, unsigned len) = NULL;
+
+/** Hard header hook for a vnet device.
+ * Delegates to the saved ethernet hard_header function, then points the
+ * skb's mac header at the start of the packet data.
+ *
+ * @return the result of the ethernet hard_header function
+ */
+static int vnet_dev_hard_header(struct sk_buff *skb,
+                                struct net_device *dev, unsigned short type,
+                                void *daddr, void *saddr, unsigned len){
+    int result;
+
+    dprintf("> skb=%p ethhdr=%p dev=%s len=%u\n",
+            skb, skb->mac.raw, dev->name, len);
+    if(!saddr){
+        dprintf("> saddr=NULL\n");
+    } else {
+        dprintf("> saddr=" MACFMT "\n", MAC6TUPLE((unsigned char*)saddr));
+    }
+    if(!daddr){
+        dprintf("> daddr=NULL\n");
+    } else {
+        dprintf("> daddr=" MACFMT "\n", MAC6TUPLE((unsigned char*)daddr));
+    }
+    result = eth_hard_header(skb, dev, type, daddr, saddr, len);
+    dprintf("> eth_hard_header=%d\n", result);
+    // The ethernet header now starts at the head of the data.
+    skb->mac.raw = skb->data;
+    dprintf("> src=" MACFMT " dst=" MACFMT "\n",
+            MAC6TUPLE(skb->mac.ethernet->h_source),
+            MAC6TUPLE(skb->mac.ethernet->h_dest));
+    dprintf("< err=%d\n", result);
+    return result;
+}
+
+/** Choose a mac address for a vnet device.
+ * If the device named by DEVICE can be found its address is copied.
+ * Otherwise an address AA:FF:xx:xx:xx:xx is made up, where the last
+ * four bytes are a counter that increments on each generated address.
+ *
+ * @param mac buffer of at least ETH_ALEN bytes to fill in
+ */
+void vnet_dev_mac(unsigned char *mac){
+    static unsigned val = 1;
+    struct net_device *dev;
+    int i;
+
+    if(vnet_get_device(DEVICE, &dev) == 0){
+        memcpy(mac, dev->dev_addr, ETH_ALEN);
+        dev_put(dev);
+        return;
+    }
+    mac[0] = 0xAA;
+    mac[1] = 0xFF;
+    // Counter bytes, most significant first.
+    for(i = 0; i < 4; i++){
+        mac[2 + i] = (unsigned char)((val >> (8 * (3 - i))) & 0xff);
+    }
+    val++;
+}
+
+/** Set up a vnet net device as an ethernet device with the vnet hooks.
+ * The header length is grown and the mtu reduced by the vnet header size.
+ *
+ * @param dev device; its priv field must point at the Vnet
+ * @return always 0
+ */
+static int vnet_dev_init(struct net_device *dev){
+    Vnet *info = (void*)dev->priv;
+
+    dprintf(">\n");
+    ether_setup(dev);
+
+    // Remember the ethernet hard_header the first time through,
+    // then interpose our wrapper.
+    if(!eth_hard_header){
+        eth_hard_header = dev->hard_header;
+    }
+    dev->hard_header = vnet_dev_hard_header;
+
+    dev->open               = vnet_dev_open;
+    dev->stop               = vnet_dev_stop;
+    dev->uninit             = vnet_dev_uninit;
+    dev->destructor         = vnet_dev_destructor;
+    dev->hard_start_xmit    = vnet_dev_hard_start_xmit;
+    dev->get_stats          = vnet_dev_get_stats;
+    dev->do_ioctl           = vnet_dev_do_ioctl;
+    dev->change_mtu         = vnet_dev_change_mtu;
+    dev->tx_timeout         = vnet_dev_tx_timeout;
+    dev->watchdog_timeo     = TX_TIMEOUT;
+    dev->set_multicast_list = vnet_dev_set_multicast_list;
+
+    // Leave room for the vnet encapsulation.
+    dev->hard_header_len += info->header_n;
+    dev->mtu -= info->header_n;
+
+    vnet_dev_mac(dev->dev_addr);
+
+    dev->flags |= IFF_DEBUG | IFF_PROMISC | IFF_ALLMULTI;
+
+    dprintf("<\n");
+    return 0;
+}
+
+/** Add the interface (net device) for a vnet.
+ * Allocates the device, names it from the vnet id, registers it and
+ * opens it. Sets the dev field of the vnet on success.
+ * Does nothing if the vnet already has an interface.
+ * On failure the device is freed and the dev field is left NULL.
+ *
+ * @param vnet vnet
+ * @return 0 on success, error code otherwise
+ */
+int vnet_dev_add(Vnet *vnet){
+    int err = 0;
+    struct net_device *dev = NULL;
+
+    dprintf("> vnet=%p\n", vnet);
+    if(vnet->dev) goto exit;
+    vnet->header_n = sizeof(struct iphdr) + sizeof(struct etheriphdr);
+    dev = kmalloc(sizeof(*dev), GFP_ATOMIC);
+    if(!dev){ err = -ENOMEM; goto exit; }
+    // Zero the device in place; assigning a compound literal may build
+    // a whole struct net_device temporary on the kernel stack.
+    memset(dev, 0, sizeof(*dev));
+    dev->priv = vnet;
+    vnet->dev = dev;
+
+    err = vnet_dev_set_name(dev);
+    if(err) goto exit;
+    vnet_dev_init(dev);
+    dprintf("> name=%s, register_netdev...\n", dev->name);
+    err = register_netdev(dev);
+    dprintf("> register_netdev=%d\n", err);
+    if(err) goto exit;
+    rtnl_lock();
+    dev_open(dev);
+    rtnl_unlock();
+
+  exit:
+    if(err){
+        // dev is NULL here if the allocation itself failed.
+        kfree(dev);
+        vnet->dev = NULL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_VNET_DEV_H_
+#define _VNET_VNET_DEV_H_
+
+struct Vnet;
+struct net_device;
+
+/* Create and register the net device for a vnet; sets vnet->dev on success. */
+extern int vnet_dev_add(struct Vnet *vnet);
+/* Unregister the vnet's net device; safe to call with a null vnet or dev. */
+extern void vnet_dev_remove(struct Vnet *vnet);
+/* Create the vnet's device (and bridge, when configured) and add the vnet to the vnet table. */
+extern int Vnet_create(struct Vnet *info);
+/* Add or remove an interface on the vnet's bridge. */
+extern int vnet_add_if(struct Vnet *vnet, struct net_device *dev);
+extern int vnet_del_if(struct Vnet *vnet, struct net_device *dev);
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+
+#include <asm/uaccess.h>
+
+#include <linux/slab.h>
+
+#include <linux/proc_fs.h>
+#include <linux/string.h>
+
+#include <linux/net.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+
+#include <sa.h>
+#include "vif.h"
+#include "vnet.h"
+#include "varp.h"
+#include "vnet_dev.h"
+
+#include "sxpr_parser.h"
+#include "iostream.h"
+#include "kernel_stream.h"
+#include "sys_string.h"
+#include "sys_net.h"
+
+#define MODULE_NAME "VNET"
+// DEBUG is defined and immediately undefined again, so debug.h is included
+// with DEBUG unset (presumably this disables dprintf output - confirm in debug.h).
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+// Functions to manage vnets.
+/*
+
+Have to rely on ethernet bridging being configured - but we can't rely
+on the kernel interface being available to us (it's not exported @!$"%!).
+
+Create a vnet N:
+- create the vnet device vnetifN: using commands to /proc, kernel api
+- create the vnet bridge vnetN: using brctl in user-space
+- for best results something should keep track of the mapping vnet id <-> bridge name
+
+Add vif device vifD.N to vnet N.
+- domain is configured with vifD.N on bridge vnetN
+- vif script adds vif to bridge using brctl
+- vif script detects that the bridge is a vnet bridge and
+ uses /proc commands to configure the mac on the vnet
+
+It wouldn't be hard to add support for specifying vnet key(s) in
+the control interface.
+
+*/
+
+ // id vnet id
+ // security security level
+ // ciphersuite: digest, cipher, keys??
+/* Security policy.
+ vnet
+ src: mac
+ dst: mac
+ coa: ip
+ Map vnet x coa -> security (none, auth, conf)
+
+ Policy, e.g.
+ - same subnet x vnet
+ - diff subnet x vnet
+ - some subnet x vnet
+ - some host addr x vnet
+
+ (security (net local) (vnet *) (mode none))
+ (security (net (not local))
+
+ (security (addr, vnet) (local-subnet addr) none)
+ (security (addr, vnet) (not (local-subnet addr)) conf)
+ (security (addr, vnet) (host 15.144.27.80)
+ (security (addr, vnet) (subnet addr 15.144.24.0/24) auth)
+ (security (addr, vnet) t auth)
+
+ (security (addr local) (mode none))
+ (security (addr local/16) (mode none))
+ (security (addr 15.144.0.0/16) (mode auth))
+ (security (addr 15.0.0.0/8) (mode conf))
+ (security (addr *) (mode drop))
+
+ ?Varp security
+ Use esp too - none, auth, conf,
+ Varp sends broadcasts (requests) and unicasts (replies).
+ Uses UDP. Could send over ESP if needed.
+ For bcast don't know where it goes, so security has to be by vnet.
+ For ucast know where it goes, so could do by vnet and addr.
+
+ Similar issue for vnets: know where unicast goes but don't know where
+ bcast goes.
+
+ Simplify: 2 levels
+ local ucast
+ nonlocal ucast, mcast
+
+ (security (local none) (nonlocal conf))
+ (security (local auth) (nonlocal conf))
+
+ VARP security matches vnet security.
+
+ */
+
+/** @file
+ *
+ * Kernel interface to files in /proc.
+ */
+
+// Root of the proc filesystem and the length of that prefix.
+#define PROC_ROOT "/proc/"
+#define PROC_ROOT_LEN 6
+// Path of this module's entry under /proc.
+#define MODULE_ROOT PROC_ROOT "vnet"
+
+enum {
+ VNET_POLICY = 1,
+};
+
+// Short names for the kernel types used by the /proc file operations.
+typedef struct proc_dir_entry ProcEntry;
+typedef struct inode Inode;
+typedef struct file File;
+
+// File operations implemented below and installed in proc_file_ops.
+static int proc_open_fn(struct inode *inode, File *file);
+static ssize_t proc_read_fn(File *file, char *buffer, size_t count, loff_t *offset);
+static ssize_t proc_write_fn(File *file, const char *buffer, size_t count, loff_t *offset) ;
+//static int proc_flush_fn(File *file);
+static loff_t proc_lseek_fn(File * file, loff_t offset, int orig);
+static int proc_ioctl_fn(struct inode *inode, File *file, unsigned opcode, unsigned long arg);
+static int proc_release_fn(struct inode *inode, File *file);
+
+// Evaluates one parsed control expression; called on file release.
+static int eval(Sxpr exp);
+
+/** Test whether a proc entry has the given name.
+ * Entries that are null or have no inode number never match.
+ *
+ * @param entry proc entry (may be null)
+ * @param name name to compare (need not be null-terminated)
+ * @param namelen length of name
+ * @return true if the entry's name equals name, FALSE otherwise
+ */
+static int ProcEntry_has_name(ProcEntry *entry, const char *name, int namelen){
+    // Check the entry before the trace below dereferences it.
+    if(!entry || !entry->low_ino) return FALSE;
+    dprintf("> name=%.*s entry=%.*s\n", namelen, name, entry->namelen, entry->name);
+    if(entry->namelen != namelen) return FALSE;
+    return memcmp(name, entry->name, namelen) == 0;
+}
+
+// Set f->f_error on error?
+// Does interface stop r/w on first error?
+// Is release called after an error?
+//
+
+/** File operations installed on the /proc entries this module registers. */
+static struct file_operations proc_file_ops = {
+    .open    = proc_open_fn,
+    .read    = proc_read_fn,
+    .write   = proc_write_fn,
+    .llseek  = proc_lseek_fn,
+    .ioctl   = proc_ioctl_fn,
+    .release = proc_release_fn,
+};
+
+/** Get the parser attached to an open file, creating and attaching
+ * one if the file does not have one yet.
+ *
+ * @param file open file; the parser lives in file->private_data
+ * @param val return parameter for the parser (NULL on failure)
+ * @return 0 on success, -ENOMEM if a parser could not be created
+ */
+static int proc_get_parser(File *file, Parser **val){
+    Parser *p = file->private_data;
+    int err = 0;
+
+    if(p == NULL){
+        p = Parser_new();
+        if(p != NULL){
+            file->private_data = p;
+        } else {
+            err = -ENOMEM;
+        }
+    }
+    *val = p;
+    return err;
+}
+
+/** User open of the control file.
+ * Starts the file with no parser attached; one is created on first use.
+ * No privilege check is made here.
+ *
+ * @return always 0
+ */
+static int proc_open_fn(Inode *inode, File *file){
+    dprintf(">\n");
+    file->private_data = NULL;
+    return 0;
+}
+
+/** User read of the control file.
+ * Reading is not supported: nothing is copied and no bytes are returned.
+ *
+ * @return always 0
+ */
+static ssize_t proc_read_fn(File *file, char *buffer,
+                            size_t count, loff_t *offset){
+    dprintf(">\n");
+    return 0;
+}
+
+/** User write to the control file.
+ * Copies the data in from user space and feeds it to the file's parser.
+ * The parsed expressions are evaluated when the file is released.
+ *
+ * @param file open file
+ * @param buffer user-space data
+ * @param count number of bytes in buffer
+ * @param offset file offset, advanced by count once the data is copied in
+ * @return count on success, negative error code otherwise
+ */
+static ssize_t proc_write_fn(File *file, const char *buffer,
+                             size_t count, loff_t *offset) {
+    int err = 0;
+    char *data = NULL;
+    Parser *parser = NULL;
+
+    err = proc_get_parser(file, &parser);
+    if(err) goto exit;
+    data = allocate(count);
+    if(!data){
+        err = -ENOMEM;
+        goto exit;
+    }
+    // copy_from_user() returns the number of bytes it could not copy,
+    // not an error code, so map any shortfall to -EFAULT.
+    if(copy_from_user(data, buffer, count)){
+        err = -EFAULT;
+        goto exit;
+    }
+    *offset += count;
+    err = Parser_input(parser, data, count);
+  exit:
+    deallocate(data);
+    err = (err < 0 ? err : count);
+    return err;
+}
+
+#if 0
+static int proc_flush_fn(File *file){
+ // User flush.
+ int writing = (file->f_flags & O_ACCMODE) == O_WRONLY;
+ int f_count = atomic_read(&file->f_count);
+ if (writing && f_count == 1) {
+ ProcEntry *pentry = (ProcEntry *)file->f_dentry->d_inode->u.generic_ip;
+ // ...
+ }
+ return retval;
+}
+#endif
+
+// Seek origins for proc_lseek_fn, defined here only if not already provided.
+#ifndef SEEK_SET
+enum {
+ /** Offset from start. */
+ SEEK_SET = 0,
+ /** Offset from current position. */
+ SEEK_CUR = 1,
+ /** Offset from size of file. */
+ SEEK_END = 2
+};
+#endif /* !SEEK_SET */
+
+/** User lseek on the control file.
+ * Supports seeking from the start and from the current position;
+ * seeking from the end is rejected.
+ *
+ * @param file open file
+ * @param offset requested offset
+ * @param from seek origin (SEEK_SET, SEEK_CUR or SEEK_END)
+ * @return the new position, or -EINVAL for an unsupported origin
+ *         or a negative resulting position
+ */
+static loff_t proc_lseek_fn(File * file, loff_t offset, int from){
+    loff_t pos;
+
+    dprintf(">\n");
+    if(from == SEEK_SET){
+        pos = offset;
+    } else if(from == SEEK_CUR){
+        pos = offset + file->f_pos;
+    } else {
+        return -EINVAL;
+    }
+    if(pos < 0) return -EINVAL;
+    file->f_pos = pos;
+    return pos;
+}
+
+/** User ioctl on the control file. No opcodes are implemented.
+ *
+ * @return always 0
+ */
+static int proc_ioctl_fn(Inode *inode, File *file,
+                         unsigned opcode, unsigned long arg){
+    dprintf(">\n");
+    return 0;
+}
+
+/** User close of the control file.
+ * Signals end of input to the parser, prints the parsed value and
+ * evaluates each expression in it in turn, stopping at the first error.
+ * The parser is freed and detached from the file in every case.
+ *
+ * @return 0 on success, error code otherwise
+ */
+static int proc_release_fn(Inode *inode, File *file){
+    Parser *parser = NULL;
+    Sxpr obj, l;
+    int err;
+
+    dprintf(">\n");
+    err = proc_get_parser(file, &parser);
+    if(!err){
+        // A null buffer marks end of input.
+        err = Parser_input(parser, NULL, 0);
+    }
+    if(!err){
+        obj = parser->val;
+        objprint(iostdout, obj, 0); IOStream_print(iostdout, "\n");
+        l = obj;
+        while(CONSP(l)){
+            err = eval(CAR(l));
+            if(err) break;
+            l = CDR(l);
+        }
+    }
+    Parser_free(parser);
+    file->private_data = NULL;
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Root entry of the proc filesystem; lookups default to starting here. */
+static ProcEntry *proc_fs_root = &proc_root;
+
+/** Strip a leading "/proc/" from a path.
+ * A relative path is passed through unchanged. An absolute path must
+ * start with PROC_ROOT.
+ *
+ * @param path path to examine
+ * @param rest return parameter: the path after any PROC_ROOT prefix,
+ *             or the original path if an error is returned
+ * @return 0 on success, -EINVAL if path is null or is an absolute
+ *         path outside /proc/
+ */
+static int proc_path_init(const char *path, const char **rest){
+    int err = 0;
+
+    if(path == NULL){
+        err = -EINVAL;
+    } else if(*path == '/'){
+        if(strncmp(PROC_ROOT, path, PROC_ROOT_LEN) == 0){
+            path += PROC_ROOT_LEN;
+        } else {
+            err = -EINVAL;
+        }
+    }
+    *rest = path;
+    return err;
+}
+
+
+/** Look up a proc entry by path, relative to `dir'. If dir is null or
+ * the proc root the path is relative to "/proc/", and the leading
+ * "/proc/" may be supplied.
+ * The path is walked one '/'-separated component at a time through the
+ * subdir lists of the entries.
+ *
+ * @param path path to look up
+ * @param dir directory to start from (may be null)
+ * @return the entry, or NULL if the path is empty, invalid or not found
+ */
+static ProcEntry * ProcFS_lookup(const char *path, ProcEntry *dir){
+    const char *seg = path;
+    const char *slash;
+    ProcEntry *cur;
+    int seglen;
+
+    if(dir && (dir != proc_fs_root)){
+        cur = dir;
+    } else {
+        if(proc_path_init(path, &seg)) return NULL;
+        cur = proc_fs_root;
+    }
+    if(!seg || !*seg) return NULL;
+    for(;;){
+        slash = strchr(seg, '/');
+        seglen = (slash ? slash - seg : strlen(seg));
+        // Search the children of the current entry for this component.
+        cur = cur->subdir;
+        while(cur && !ProcEntry_has_name(cur, seg, seglen)){
+            cur = cur->next;
+        }
+        if(!cur) return NULL;
+        if(!slash) return cur;
+        seg = slash + 1;
+    }
+}
+
+/** Create a proc entry that uses this module's file operations.
+ *
+ * @param name entry name
+ * @param dir parent directory
+ * @param val value stored in the entry's data field
+ * @return the new entry, or NULL if it could not be created
+ */
+static ProcEntry *ProcFS_register(const char *name, ProcEntry *dir, int val){
+    ProcEntry *created = create_proc_entry(name, (mode_t)0, dir);
+
+    if(!created) return NULL;
+    created->proc_fops = &proc_file_ops;
+    created->data = (void*)val;
+    return created;
+}
+
+/** Get a proc directory, creating it if it does not exist.
+ *
+ * @param name directory path (a leading "/proc/" is stripped before creating)
+ * @param parent parent directory
+ * @return the existing or new entry, or NULL on failure
+ */
+static ProcEntry *ProcFS_mkdir(const char *name, ProcEntry *parent){
+    const char *relpath;
+    ProcEntry *found = ProcFS_lookup(name, parent);
+
+    if(found) return found;
+    if(proc_path_init(name, &relpath)) return NULL;
+    return proc_mkdir(relpath, parent);
+}
+
+/** Remove a single proc entry.
+ *
+ * @param name entry name
+ * @param parent directory containing the entry
+ */
+static void ProcFS_remove(const char *name, ProcEntry *parent){
+    remove_proc_entry(name, parent);
+}
+
+/** Remove a proc entry and everything below it.
+ * Children are removed first, one at a time, until the subdir list is
+ * empty. An entry that is its own parent (the proc root) is left alone.
+ *
+ * @param entry entry to remove (may be null)
+ */
+static void ProcFS_rmrec_entry(ProcEntry *entry){
+    if(!entry) return;
+    // Don't want to remove /proc itself!
+    if(entry->parent == entry) return;
+    while(entry->subdir){
+        ProcFS_rmrec_entry(entry->subdir);
+    }
+    dprintf("> remove %s\n", entry->name);
+    ProcFS_remove(entry->name, entry->parent);
+}
+
+/** Look up a proc entry by name and remove it recursively.
+ * Does nothing if the entry is not found.
+ *
+ * @param name entry path
+ * @param parent directory to look in
+ */
+static void ProcFS_rmrec(const char *name, ProcEntry *parent){
+    ProcEntry *target;
+
+    dprintf("> name=%s\n", name);
+    target = ProcFS_lookup(name, parent);
+    if(target != NULL){
+        ProcFS_rmrec_entry(target);
+    }
+    dprintf("<\n");
+}
+
+/** Get the string value of an atom or string expression.
+ *
+ * @param exp expression
+ * @param s return parameter for the string (NULL on error)
+ * @return 0 on success, -EINVAL if exp is neither an atom nor a string
+ */
+static int stringof(Sxpr exp, char **s){
+    if(ATOMP(exp)){
+        *s = atom_name(exp);
+        return 0;
+    }
+    if(STRINGP(exp)){
+        *s = string_string(exp);
+        return 0;
+    }
+    *s = NULL;
+    return -EINVAL;
+}
+
+/** Get the string value of the child of exp with the given key.
+ *
+ * @param exp expression
+ * @param key child key
+ * @param s return parameter for the string
+ * @return 0 on success, error code otherwise
+ */
+static int child_string(Sxpr exp, Sxpr key, char **s){
+    return stringof(sxpr_child_value(exp, key, ONONE), s);
+}
+
+/** Get the integer value of an expression.
+ * An integer object is used directly; an atom or string is converted
+ * with convert_atoul().
+ *
+ * @param exp expression
+ * @param v return parameter for the value; written only on success
+ * @return 0 on success, error code otherwise
+ */
+static int intof(Sxpr exp, int *v){
+    int err = 0;
+    char *s;
+    unsigned long l = 0;
+    if(INTP(exp)){
+        *v = OBJ_INT(exp);
+    } else {
+        err = stringof(exp, &s);
+        if(err) goto exit;
+        err = convert_atoul(s, &l);
+        // Do not hand back a value the conversion never produced.
+        if(err) goto exit;
+        *v = (int)l;
+    }
+  exit:
+    return err;
+}
+
+/** Get the integer value of the child of exp with the given key.
+ *
+ * @param exp expression
+ * @param key child key
+ * @param v return parameter for the value
+ * @return 0 on success, error code otherwise
+ */
+static int child_int(Sxpr exp, Sxpr key, int *v){
+    return intof(sxpr_child_value(exp, key, ONONE), v);
+}
+
+/** Get the mac address value of an expression, converting its string
+ * form with mac_aton().
+ *
+ * @param exp expression
+ * @param v buffer for the mac address
+ * @return 0 on success, error code otherwise
+ */
+static int macof(Sxpr exp, unsigned char *v){
+    char *text;
+    int err = stringof(exp, &text);
+
+    if(!err){
+        err = mac_aton(text, v);
+    }
+    return err;
+}
+
+/** Get the mac address value of the child of exp with the given key.
+ *
+ * @param exp expression
+ * @param key child key
+ * @param v buffer for the mac address
+ * @return 0 on success, error code otherwise
+ */
+static int child_mac(Sxpr exp, Sxpr key, unsigned char *v){
+    return macof(sxpr_child_value(exp, key, ONONE), v);
+}
+
+/** Get the internet address value of an expression, converting its
+ * string form with get_inet_addr().
+ *
+ * @param exp expression
+ * @param v return parameter for the address; written only on success
+ * @return 0 on success, error code otherwise
+ */
+static int addrof(Sxpr exp, uint32_t *v){
+    char *text;
+    unsigned long addr;
+    int err = stringof(exp, &text);
+
+    if(!err){
+        err = get_inet_addr(text, &addr);
+    }
+    if(!err){
+        *v = (uint32_t)addr;
+    }
+    return err;
+}
+
+/** Get the internet address value of the child of exp with the given key.
+ *
+ * @param exp expression
+ * @param key child key
+ * @param v return parameter for the address
+ * @return 0 on success, error code otherwise
+ */
+static int child_addr(Sxpr exp, Sxpr key, uint32_t *v){
+    return addrof(sxpr_child_value(exp, key, ONONE), v);
+}
+
+/** Create a vnet.
+ * It is an error if a vnet with the same id exists.
+ * The local reference to the vnet record is dropped before returning,
+ * whether it came from the lookup or from the allocation.
+ *
+ * @param vnet vnet id
+ * @param security security level
+ * @return 0 on success, -EEXIST if the vnet exists, other error code otherwise
+ */
+static int ctrl_vnet_add(int vnet, int security){
+    Vnet *info = NULL;
+    int err;
+
+    if(Vnet_lookup(vnet, &info) == 0){
+        err = -EEXIST;
+    } else {
+        err = Vnet_alloc(&info);
+        if(!err){
+            info->vnet = vnet;
+            info->security = security;
+            err = Vnet_create(info);
+        }
+    }
+    if(info) Vnet_decref(info);
+    return err;
+}
+
+/** Delete a vnet. Not implemented: a vnet can't be deleted
+ * if there are any vifs on it.
+ *
+ * @param vnet vnet id
+ * @return always -ENOSYS
+ */
+static int ctrl_vnet_del(int vnet){
+    return -ENOSYS;
+}
+
+/** Create an entry for a vif with the given vnet and vmac.
+ * The vnet must exist.
+ *
+ * @param vnet vnet id
+ * @param vmac mac address
+ * @return 0 on success, error code otherwise
+ */
+static int ctrl_vif_add(int vnet, Vmac *vmac){
+    Vnet *info = NULL;
+    Vif *entry = NULL;
+    int err;
+
+    dprintf(">\n");
+    err = Vnet_lookup(vnet, &info);
+    if(!err){
+        err = vif_add(vnet, vmac, &entry);
+    }
+    // Drop the references taken above.
+    if(info) Vnet_decref(info);
+    if(entry) vif_decref(entry);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Add net device 'vifname' to the bridge for 'vnet' and
+ * create an entry for a vif with the given vnet and vmac.
+ * This is used when device 'vifname' is a virtual device
+ * connected to a vif in a vm.
+ * The vif entry records the device pointer.
+ *
+ * @param vifname name of device to bridge
+ * @param vnet vnet id
+ * @param vmac mac address
+ * @return 0 on success, error code otherwise
+ */
+static int ctrl_vif_conn(char *vifname, int vnet, Vmac *vmac){
+    Vnet *info = NULL;
+    Vif *entry = NULL;
+    struct net_device *netdev = NULL;
+    int err;
+
+    dprintf("> %s\n", vifname);
+    err = Vnet_lookup(vnet, &info);
+    if(!err){
+        err = vif_add(vnet, vmac, &entry);
+    }
+    if(!err){
+        err = vnet_get_device(vifname, &netdev);
+    }
+    if(!err){
+        entry->dev = netdev;
+        err = vnet_add_if(info, netdev);
+    }
+    // Drop the references taken above.
+    if(info) Vnet_decref(info);
+    if(entry) vif_decref(entry);
+    if(netdev) dev_put(netdev);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Delete a vif.
+ * If the vif has a device it is first taken off the vnet's bridge.
+ *
+ * @param vnet vnet id
+ * @param vmac mac address
+ * @return 0 on success, error code otherwise
+ */
+static int ctrl_vif_del(int vnet, Vmac *vmac){
+    Vnet *info = NULL;
+    Vif *entry = NULL;
+    int err;
+
+    dprintf(">\n");
+    err = Vnet_lookup(vnet, &info);
+    if(!err){
+        err = vif_lookup(vnet, vmac, &entry);
+    }
+    if(!err){
+        if(entry->dev){
+            vnet_del_if(info, entry->dev);
+            entry->dev = NULL;
+        }
+        vif_remove(vnet, vmac);
+    }
+    // Drop the references taken above.
+    if(info) Vnet_decref(info);
+    if(entry) vif_decref(entry);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Evaluate (varp.print): print the varp state via varp_print().
+ *
+ * @param exp expression (unused)
+ * @return always 0
+ */
+static int eval_varp_print(Sxpr exp){
+    varp_print();
+    return 0;
+}
+
+/** Evaluate (varp.mcaddr (addr <addr>)): set the varp multicast address.
+ *
+ * @param exp expression
+ * @return result of reading the address; the address is set unless
+ *         that result is negative
+ */
+static int eval_varp_mcaddr(Sxpr exp){
+    Sxpr key_addr = intern("addr");
+    uint32_t mcaddr;
+    int err = child_addr(exp, key_addr, &mcaddr);
+
+    if(err >= 0){
+        varp_set_mcast_addr(mcaddr);
+    }
+    return err;
+}
+
+/** Evaluate (vnet.add (id <id>) [(security { none | auth | conf } )] ).
+ * The id must not be below VNET_VIF. The security level defaults
+ * to none.
+ *
+ * @param exp expression
+ * @return 0 on success, -EINVAL for a bad id or security level,
+ *         other error code otherwise
+ */
+static int eval_vnet_add(Sxpr exp){
+    Sxpr key_id = intern("id");
+    Sxpr key_security = intern("security");
+    Sxpr level;
+    char *level_name;
+    int id;
+    int sec;
+    int err;
+
+    err = child_int(exp, key_id, &id);
+    if(err) goto done;
+    if(id < VNET_VIF){
+        err = -EINVAL;
+        goto done;
+    }
+    level = sxpr_child_value(exp, key_security, intern("none"));
+    err = stringof(level, &level_name);
+    if(err) goto done;
+    if(!strcmp(level_name, "none")){
+        sec = 0;
+    } else if(!strcmp(level_name, "auth")){
+        sec = SA_AUTH;
+    } else if(!strcmp(level_name, "conf")){
+        sec = SA_CONF;
+    } else {
+        err = -EINVAL;
+        goto done;
+    }
+    dprintf("> vnet id=%d\n", id);
+    err = ctrl_vnet_add(id, sec);
+  done:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handle (vnet.del (id <id>)): delete a vnet.
+ *
+ * @param exp the expression; its id child is the vnet id
+ * @return 0 on success, error code otherwise
+ */
+static int eval_vnet_del(Sxpr exp){
+    Sxpr id_sym = intern("id");
+    int vnet_id;
+    int err = child_int(exp, id_sym, &vnet_id);
+
+    if(!err){
+        err = ctrl_vnet_del(vnet_id);
+    }
+    return err;
+}
+
+/** Handle (vif.add (vnet <vnet>) (vmac <macaddr>)): add a vif.
+ *
+ * @param exp the expression
+ * @return 0 on success, error code otherwise
+ */
+static int eval_vif_add(Sxpr exp){
+    Sxpr vnet_sym = intern("vnet");
+    Sxpr vmac_sym = intern("vmac");
+    Vmac mac = {};
+    int vnet_id;
+    int err;
+
+    err = child_int(exp, vnet_sym, &vnet_id);
+    if(!err) err = child_mac(exp, vmac_sym, mac.mac);
+    if(!err) err = ctrl_vif_add(vnet_id, &mac);
+    return err;
+}
+
+/** Handle (vif.conn (vif <name>) (vnet <id>) (vmac <mac>)):
+ * connect the named vif to a vnet.
+ *
+ * @param exp the expression
+ * @return 0 on success, error code otherwise
+ */
+static int eval_vif_conn(Sxpr exp){
+    int err = 0;
+    Sxpr ovif = intern("vif");
+    Sxpr ovnet = intern("vnet");
+    Sxpr ovmac = intern("vmac");
+    char *vif = NULL;
+    int vnet = 0;
+    Vmac vmac = {};
+
+    err = child_string(exp, ovif, &vif);
+    if(err) goto exit;
+    err = child_int(exp, ovnet, &vnet);
+    if(err) goto exit;
+    err = child_mac(exp, ovmac, vmac.mac);
+    // A bad or missing vmac must stop here, as in vif.add and vif.del;
+    // otherwise the connect would proceed with an all-zero mac.
+    if(err) goto exit;
+    dprintf("> connect vif=%s vnet=%d\n", vif, vnet);
+    err = ctrl_vif_conn(vif, vnet, &vmac);
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handle (vif.del (vnet <vnet>) (vmac <macaddr>)): delete a vif.
+ *
+ * @param exp the expression
+ * @return 0 on success, error code otherwise
+ */
+static int eval_vif_del(Sxpr exp){
+    Sxpr vnet_sym = intern("vnet");
+    Sxpr vmac_sym = intern("vmac");
+    Vmac mac = {};
+    int vnet_id;
+    int err;
+
+    err = child_int(exp, vnet_sym, &vnet_id);
+    if(!err) err = child_mac(exp, vmac_sym, mac.mac);
+    if(!err) err = ctrl_vif_del(vnet_id, &mac);
+    return err;
+}
+
+typedef struct SxprEval { /* One entry of the command table used by eval(). */
+ Sxpr elt; /* element tested against the expression with sxpr_elementp() */
+ int (*fn)(Sxpr); /* handler called with the whole expression on a match */
+} SxprEval;
+
+/** Dispatch an expression to the handler whose name it matches.
+ * The table is searched in order and the first match wins.
+ *
+ * @param exp the expression
+ * @return the handler's result, or -EINVAL if nothing matched
+ */
+static int eval(Sxpr exp){
+    SxprEval table[] = {
+        { intern("varp.print"), eval_varp_print },
+        { intern("varp.mcaddr"), eval_varp_mcaddr },
+        { intern("vif.add"), eval_vif_add },
+        { intern("vif.conn"), eval_vif_conn },
+        { intern("vif.del"), eval_vif_del },
+        { intern("vnet.add"), eval_vnet_add },
+        { intern("vnet.del"), eval_vnet_del },
+        { ONONE, NULL } };
+    SxprEval *entry = table;
+    int err = -EINVAL;
+
+    dprintf(">\n");
+    // The table is terminated by an entry whose element is ONONE.
+    while(!NONEP(entry->elt)){
+        if(sxpr_elementp(exp, entry->elt)){
+            err = entry->fn(exp);
+            break;
+        }
+        entry++;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Create the MODULE_ROOT proc directory and register the "policy"
+ * entry (VNET_POLICY) inside it. Nothing is registered if the
+ * directory could not be created.
+ */
+void __init ProcFS_init(void){
+    ProcEntry *root;
+
+    dprintf(">\n");
+    root = ProcFS_mkdir(MODULE_ROOT, NULL);
+    if(root){
+        ProcFS_register("policy", root, VNET_POLICY);
+    }
+    dprintf("<\n");
+}
+
+/** Remove the MODULE_ROOT proc directory by calling ProcFS_rmrec().
+ */
+void __exit ProcFS_exit(void)
+{
+    dprintf(">\n");
+    ProcFS_rmrec(MODULE_ROOT, NULL);
+    dprintf("<\n");
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free software Foundation, Inc.,
+ * 59 Temple Place, suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+#ifndef _VNET_VNET_IOCTL_H_
+#define _VNET_VNET_IOCTL_H_
+
+extern void ProcFS_init(void); /* create the module proc directory and its "policy" entry */
+extern void ProcFS_exit(void); /* remove the module proc directory */
+
+#endif /* ! _VNET_VNET_IOCTL_H_ */
--- /dev/null
+# -*- mode: Makefile; -*-
+#----------------------------------------------------------------------------
+# Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+#
+# This library is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as
+# published by the Free Software Foundation; either version 2.1 of the
+# License, or (at your option) any later version. This library is
+# distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.
+# See the GNU Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this library; if not, write to the Free Software Foundation,
+# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#----------------------------------------------------------------------------
+
+# Default target: build vnetd.
+all: vnetd
+
+#----------------------------------------------------------------------------
+
+# Root of the Xen tree, relative to this directory.
+XEN_ROOT = ../../..
+include $(XEN_ROOT)/tools/Make.defs
+
+# Directory (below $(prefix)) that the install target copies vnetd into.
+VNETD_INSTALL_DIR = /usr/sbin
+
+# LIB_DIR: searched for the LIB_SRC sources (vpath) and for headers.
+LIB_DIR = $(XEN_LIBXUTIL)
+# VNET_DIR: added to the include path only.
+VNET_DIR = ../vnet-module
+
+INCLUDES += -I$(LIB_DIR)
+INCLUDES += -I$(VNET_DIR)
+
+#----------------------------------------------------------------------------
+# GC.
+# Location of the garbage collector install tree; vnetd is linked against
+# the static libgc.a found under GC_LIB_DIR (see VNETD_LIBS).
+GC_DIR:=../gc/install
+GC_INCLUDE:= $(GC_DIR)/include
+GC_LIB_DIR:=$(GC_DIR)/lib
+
+INCLUDES += -I$(GC_INCLUDE)
+#LIBS += -L$(GC_LIB_DIR)
+# USE_GC is tested in the sources (e.g. Timer_free compiles to a no-op).
+CPPFLAGS += -D USE_GC
+
+#----------------------------------------------------------------------------
+CFLAGS += -g
+CFLAGS += -Wall
+CFLAGS += $(INCLUDES) $(LIBS)
+
+LDFLAGS += $(LIBS)
+
+# Dependencies. Gcc generates them for us.
+# Each compile writes .<target>.d next to the object; PROG_DEP matches them.
+CFLAGS += -Wp,-MD,.$(@F).d
+PROG_DEP = .*.d
+
+# Library sources are compiled from LIB_DIR without being copied here.
+vpath %.c $(LIB_DIR)
+
+# Header search directories: INCLUDES with the leading -I removed from each
+# word. A substitution reference without % only replaces a word *suffix*,
+# so the pattern form is needed to strip the -I prefix.
+IPATHS:=$(INCLUDES:-I%=%)
+vpath %.h $(IPATHS)
+
+#----------------------------------------------------------------------------
+# Sources that live in this directory.
+VNETD_SRC:=
+VNETD_SRC+= connection.c
+VNETD_SRC+= marshal.c
+VNETD_SRC+= select.c
+VNETD_SRC+= timer.c
+VNETD_SRC+= vcache.c
+VNETD_SRC+= vnetd.c
+
+# Sources found through the vpath for LIB_DIR.
+LIB_SRC:=
+LIB_SRC+= allocate.c
+LIB_SRC+= enum.c
+LIB_SRC+= file_stream.c
+LIB_SRC+= hash_table.c
+LIB_SRC+= iostream.c
+LIB_SRC+= lexis.c
+LIB_SRC+= socket_stream.c
+LIB_SRC+= string_stream.c
+LIB_SRC+= sxpr.c
+LIB_SRC+= sys_net.c
+LIB_SRC+= sys_string.c
+LIB_SRC+= util.c
+
+VNETD_SRC+=$(LIB_SRC)
+
+VNETD_OBJ := $(VNETD_SRC:.c=.o)
+
+# The collector is linked statically; the alternatives are kept for reference.
+#VNETD_LIBS:= $(GC_LIB_DIR)/libgc.so.1.0.2
+#VNETD_LIBS:= -lgc
+VNETD_LIBS:= $(GC_LIB_DIR)/libgc.a
+
+# Link the daemon from all objects plus the collector library.
+vnetd: $(VNETD_OBJ)
+ $(CC) $(CFLAGS) -o $@ $^ $(VNETD_LIBS) -ldl -lpthread
+
+# Install vnetd with mode 0755 under $(prefix)/$(VNETD_INSTALL_DIR).
+install: vnetd
+ mkdir -p $(prefix)/$(VNETD_INSTALL_DIR)
+ install -m 0755 vnetd $(prefix)/$(VNETD_INSTALL_DIR)
+
+# Remove objects, archives, editor backups, the binary and dependency files.
+clean:
+ -rm -f *.a *.o *~
+ -rm -f vnetd
+ -rm -f $(PROG_DEP)
+
+# Pull in generated dependency files if any exist.
+-include $(PROG_DEP)
--- /dev/null
+/*
+ * Copyright (C) 2003 - 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include "allocate.h"
+#include "connection.h"
+#include "file_stream.h"
+#include "socket_stream.h"
+
+#define DEBUG
+#undef DEBUG
+#define MODULE_NAME "conn"
+#include "debug.h"
+
+/** Wrap a file descriptor in a file stream.
+ * On failure the stream (if it was opened) is closed again and
+ * NULL is stored in io.
+ *
+ * @param fd file descriptor
+ * @param mode file mode
+ * @param buffered leave the stream buffered if nonzero, make it unbuffered if 0
+ * @param io return parameter for the stream
+ * @return 0 on success, negative errno value otherwise
+ */
+static int stream_init(int fd, const char *mode, int buffered, IOStream **io){
+    IOStream *stream = file_stream_fdopen(fd, mode);
+    int err = 0;
+
+    if(!stream){
+        err = -errno;
+        perror("fdopen");
+    } else if(!buffered && file_stream_setvbuf(stream, NULL, _IONBF, 0)){
+        err = -errno;
+        perror("setvbuf");
+        if(err){
+            IOStream_close(stream);
+            stream = NULL;
+        }
+    }
+    *io = stream;
+    return err;
+}
+
+/** Prepend a connection to a connection list.
+ *
+ * @param conn connection to add
+ * @param l existing list (becomes the tail)
+ * @return the new list head
+ */
+ConnList * ConnList_add(Conn *conn, ConnList *l){
+    ConnList *head = ALLOCATE(ConnList);
+    head->next = l;
+    head->conn = conn;
+    return head;
+}
+
+/** Allocate a connection and set its handler and user data.
+ *
+ * @param fn handler called by Conn_handle() (may be NULL)
+ * @param data user data stored in the connection
+ * @return new connection, or NULL if allocation failed
+ */
+Conn *Conn_new(int (*fn)(Conn *), void *data){
+    Conn *conn;
+    conn = ALLOCATE(Conn);
+    // Report allocation failure instead of writing through NULL.
+    if(!conn) return NULL;
+    conn->fn = fn;
+    conn->data = data;
+    return conn;
+}
+
+/** Run a connection's handler.
+ * The connection is closed if the result is negative.
+ *
+ * @param conn connection
+ * @return the handler's result, or -ENOSYS if there is no handler
+ */
+int Conn_handle(Conn *conn){
+    int err;
+    dprintf(">\n");
+    if(!conn->fn){
+        dprintf("> no handler\n");
+        err = -ENOSYS;
+    } else {
+        err = conn->fn(conn);
+    }
+    if(err < 0) Conn_close(conn);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Initialize a connection.
+ * For SOCK_STREAM sockets the in and out streams are unbuffered file
+ * streams on the socket; for any other type they are socket streams,
+ * and the out stream is given the peer address.
+ *
+ * @param conn connection
+ * @param sock socket
+ * @param type socket type (e.g. SOCK_STREAM)
+ * @param addr peer address
+ * @return 0 on success, error code otherwise
+ */
+int Conn_init(Conn *conn, int sock, int type, struct sockaddr_in addr){
+    int err = 0;
+    conn->addr = addr;
+    conn->type = type;
+    conn->sock = sock;
+    // Start from a known state so Conn_close() is safe after a failure here.
+    conn->in = NULL;
+    conn->out = NULL;
+    if(type == SOCK_STREAM){
+        err = stream_init(sock, "r", 0, &conn->in);
+        if(err) goto exit;
+        err = stream_init(sock, "w", 0, &conn->out);
+        if(err) goto exit;
+    } else {
+        conn->in = socket_stream_new(sock);
+        conn->out = socket_stream_new(sock);
+        // NOTE(review): assumes socket_stream_new() returns NULL on failure.
+        if(!conn->in || !conn->out){
+            err = -ENOMEM;
+            goto exit;
+        }
+        socket_stream_set_addr(conn->out, &addr);
+    }
+  exit:
+    if(err) eprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Open a connection.
+ * Creates a socket, connects it and initializes the connection on it.
+ *
+ * @param conn connection
+ * @param socktype socket type
+ * @param ipaddr ip address to connect to
+ * @param port port (stored in sin_port as given, i.e. network order)
+ * @return 0 on success, error code otherwise
+ */
+int Conn_connect(Conn *conn, int socktype, struct in_addr ipaddr, uint16_t port){
+    int err = 0;
+    int sock;
+    struct sockaddr_in addr_in = {0};
+    struct sockaddr *addr = (struct sockaddr *)&addr_in;
+    socklen_t addr_n = sizeof(addr_in);
+    dprintf("> addr=%s:%d\n", inet_ntoa(ipaddr), ntohs(port));
+    sock = socket(AF_INET, socktype, 0);
+    if(sock < 0){
+        err = -errno;
+        goto exit;
+    }
+    addr_in.sin_family = AF_INET;
+    addr_in.sin_addr = ipaddr;
+    addr_in.sin_port = port;
+    if(connect(sock, addr, addr_n) < 0){
+        // connect() only says -1: report the real cause, and release the
+        // socket, which nothing else refers to yet.
+        err = -errno;
+        close(sock);
+        goto exit;
+    }
+    // If Conn_init() fails the socket is left as it is: a stream may
+    // already have been opened on it.
+    err = Conn_init(conn, sock, socktype, addr_in);
+  exit:
+    if(err) eprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Close a connection: close its streams (when present), then shut
+ * the socket down in both directions. Does nothing for NULL.
+ *
+ * @param conn connection (may be NULL)
+ */
+void Conn_close(Conn *conn){
+    if(conn){
+        if(conn->in){
+            IOStream_close(conn->in);
+        }
+        if(conn->out){
+            IOStream_close(conn->out);
+        }
+        // 2 is the value of SHUT_RDWR on Linux.
+        shutdown(conn->sock, 2);
+    }
+}
--- /dev/null
+/*
+ * Copyright (C) 2003 - 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _VNET_CONNECTION_H_
+#define _VNET_CONNECTION_H_
+
+#include <netinet/in.h>
+
+#include "iostream.h"
+
+/** A connection.
+ * The underlying transport is a socket.
+ * Contains in and out streams using the socket.
+ */
+typedef struct Conn {
+ struct sockaddr_in addr; /**< address passed to Conn_init() */
+ int sock; /**< socket descriptor */
+ int type; /**< socket type, e.g. SOCK_STREAM */
+ IOStream *in; /**< input stream on the socket */
+ IOStream *out; /**< output stream on the socket */
+ int (*fn)(struct Conn *); /**< handler run by Conn_handle(), may be NULL */
+ void *data; /**< user data given to Conn_new() */
+} Conn;
+
+typedef struct ConnList { /* singly linked list of connections */
+ Conn *conn; /**< connection held by this node */
+ struct ConnList *next; /**< rest of the list */
+} ConnList;
+
+extern ConnList * ConnList_add(Conn *conn, ConnList *l); /* prepend conn to l, returns new head */
+
+extern Conn * Conn_new(int (*fn)(struct Conn *), void *data); /* allocate a connection */
+extern int Conn_init(Conn *conn, int sock, int type, struct sockaddr_in addr); /* set up streams on sock */
+extern int Conn_connect(Conn *conn, int type, struct in_addr ipaddr, uint16_t port); /* create socket, connect, init */
+extern int Conn_handle(Conn *conn); /* run the handler; closes conn on a negative result */
+extern void Conn_close(Conn *conn); /* close streams and shut the socket down */
+
+#endif /* ! _VNET_CONNECTION_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2001 - 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <errno.h>
+#include "sys_net.h"
+#include "allocate.h"
+#include "marshal.h"
+
+#define MODULE_NAME "marshal"
+#define DEBUG
+#undef DEBUG
+#include "debug.h"
+
+#define ARRAY_SIZE(ary) (sizeof(ary)/sizeof((ary)[0]))
+
+/* Messages are coded as msgid followed by message fields.
+ * Initial message on any channel is hello - so can check version
+ * compatibility.
+ *
+ * char* -> uint32_t:n <n bytes>
+ * ints/uints go as suitable number of bytes (e.g. uint16_t is 2 bytes).
+ * optional fields go as '1' <val> or '0' (the 0/1 is 1 byte).
+ * lists go as ('1' <elt>)* '0'
+ */
+
+/** Flush a stream.
+ *
+ * @param io stream
+ * @return result of IOStream_flush()
+ */
+int marshal_flush(IOStream *io){
+    return IOStream_flush(io);
+}
+
+/** Write a block of bytes to a stream.
+ *
+ * @param io stream
+ * @param s bytes to write
+ * @param s_n number of bytes
+ * @return 0 on success, the (negative) write result on error,
+ * -EIO if fewer than s_n bytes were written
+ */
+int marshal_bytes(IOStream *io, void *s, uint32_t s_n){
+    int written = IOStream_write(io, s, s_n);
+    if(written < 0){
+        return written;
+    }
+    if(written < s_n){
+        dprintf("> Wanted %d, got %d\n", s_n, written);
+        return -EIO;
+    }
+    return 0;
+}
+
+/** Read a block of bytes from a stream.
+ *
+ * @param io stream
+ * @param s buffer to fill
+ * @param s_n number of bytes wanted
+ * @return 0 on success, the (negative) read result on error,
+ * -EIO if fewer than s_n bytes were read
+ */
+int unmarshal_bytes(IOStream *io, void *s, uint32_t s_n){
+    int got = IOStream_read(io, s, s_n);
+    if(got < 0){
+        return got;
+    }
+    if(got < s_n){
+        dprintf("> Wanted %d, got %d\n", s_n, got);
+        return -EIO;
+    }
+    return 0;
+}
+
+/** Write one byte.
+ * @return 0 on success, error code otherwise
+ */
+int marshal_uint8(IOStream *io, uint8_t x){
+    uint8_t byte = x;
+    return marshal_bytes(io, &byte, sizeof(byte));
+}
+
+/** Read one byte into x.
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_uint8(IOStream *io, uint8_t *x){
+    int err = unmarshal_bytes(io, x, sizeof(*x));
+    return err;
+}
+
+/** Write a 16-bit value, converted from host to network order.
+ * @return 0 on success, error code otherwise
+ */
+int marshal_uint16(IOStream *io, uint16_t x){
+    uint16_t wire = htons(x);
+    return marshal_bytes(io, &wire, sizeof(wire));
+}
+
+/** Read a 16-bit value and convert it from network to host order.
+ * The conversion is applied to *x whether or not the read succeeded.
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_uint16(IOStream *io, uint16_t *x){
+    int err = unmarshal_bytes(io, x, sizeof(*x));
+    *x = ntohs(*x);
+    return err;
+}
+
+/** Write a signed 32-bit value, converted with htonl().
+ * @return 0 on success, error code otherwise
+ */
+int marshal_int32(IOStream *io, int32_t x){
+    int32_t wire = htonl(x);
+    return marshal_bytes(io, &wire, sizeof(wire));
+}
+
+/** Read a signed 32-bit value and convert it with ntohl().
+ * The conversion is applied to *x whether or not the read succeeded.
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_int32(IOStream *io, int32_t *x){
+    int err = unmarshal_bytes(io, x, sizeof(*x));
+    *x = ntohl(*x);
+    return err;
+}
+
+/** Write an unsigned 32-bit value, converted from host to network order.
+ * @return 0 on success, error code otherwise
+ */
+int marshal_uint32(IOStream *io, uint32_t x){
+    uint32_t wire = htonl(x);
+    return marshal_bytes(io, &wire, sizeof(wire));
+}
+
+/** Read an unsigned 32-bit value and convert it from network to host order.
+ * The conversion is applied to *x whether or not the read succeeded.
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_uint32(IOStream *io, uint32_t *x){
+    int err = unmarshal_bytes(io, x, sizeof(*x));
+    *x = ntohl(*x);
+    return err;
+}
+
+/** Write a 64-bit value as two 32-bit values, high word first.
+ * @return 0 on success, error code otherwise
+ */
+int marshal_uint64(IOStream *io, uint64_t x){
+    uint32_t hi = (uint32_t) ((x >> 32) & 0xffffffff);
+    uint32_t lo = (uint32_t) ( x & 0xffffffff);
+    int err = marshal_uint32(io, hi);
+    if(!err){
+        err = marshal_uint32(io, lo);
+    }
+    return err;
+}
+
+/** Read a 64-bit value sent as two 32-bit values, high word first.
+ * *x is only written when both words were read successfully.
+ *
+ * @param io stream
+ * @param x return parameter for the value
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_uint64(IOStream *io, uint64_t *x){
+    int err = 0;
+    uint32_t hi = 0, lo = 0;
+    err = unmarshal_uint32(io, &hi);
+    if(err) goto exit;
+    err = unmarshal_uint32(io, &lo);
+    // Do not build a value from a low word that was never read.
+    if(err) goto exit;
+    *x = (((uint64_t) hi) << 32) | lo;
+  exit:
+    return err;
+}
+
+/** Write a 16-bit value that is already in network order (no conversion).
+ * @return 0 on success, error code otherwise
+ */
+int marshal_net16(IOStream *io, net16_t x){
+    net16_t wire = x;
+    return marshal_bytes(io, &wire, sizeof(wire));
+}
+
+/** Read a 16-bit value and leave it in network order (no conversion).
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_net16(IOStream *io, net16_t *x){
+    return unmarshal_bytes(io, x, sizeof(*x));
+}
+
+/** Write a 32-bit value that is already in network order (no conversion).
+ * @return 0 on success, error code otherwise
+ */
+int marshal_net32(IOStream *io, net32_t x){
+    net32_t wire = x;
+    return marshal_bytes(io, &wire, sizeof(wire));
+}
+
+/** Read a 32-bit value and leave it in network order (no conversion).
+ * @return 0 on success, error code otherwise
+ */
+int unmarshal_net32(IOStream *io, net32_t *x){
+    return unmarshal_bytes(io, x, sizeof(*x));
+}
+
+/** Write a string as a 32-bit length followed by that many bytes.
+ *
+ * @param io stream
+ * @param s string bytes
+ * @param s_n number of bytes to send
+ * @return 0 on success, error code otherwise
+ */
+int marshal_string(IOStream *io, char *s, uint32_t s_n){
+    int err = marshal_uint32(io, s_n);
+    if(!err){
+        err = marshal_bytes(io, s, s_n);
+    }
+    return err;
+}
+
+/** Read a length-prefixed string into a caller-supplied buffer and
+ * null-terminate it.
+ *
+ * @param io stream
+ * @param s buffer
+ * @param s_n buffer size; the string plus terminator must fit
+ * @return 0 on success, -EINVAL if the string does not fit,
+ * other error code on read failure
+ */
+int unmarshal_string(IOStream *io, char *s, uint32_t s_n){
+    int err = 0;
+    // The length is a uint32_t on the wire and in unmarshal_uint32().
+    uint32_t val_n = 0;
+    err = unmarshal_uint32(io, &val_n);
+    if(err) goto exit;
+    if(val_n >= s_n){
+        err = -EINVAL;
+        goto exit;
+    }
+    err = unmarshal_bytes(io, s, val_n);
+    if(err) goto exit;
+    s[val_n] = '\0';
+  exit:
+    return err;
+}
+
+/** Read a length-prefixed string into a newly allocated,
+ * null-terminated buffer.
+ * On error *s is set to NULL and *s_n (if given) to 0.
+ *
+ * @param io stream
+ * @param s return parameter for the new string
+ * @param s_n return parameter for its length (may be NULL)
+ * @return 0 on success, -EINVAL for an unusable length, -ENOMEM if
+ * allocation failed, other error code on read failure
+ */
+int unmarshal_new_string(IOStream *io, char **s, uint32_t *s_n){
+    int err = 0;
+    uint32_t val_n = 0;
+    char *val = NULL;
+    err = unmarshal_uint32(io, &val_n);
+    if(err) goto exit;
+    // The length comes off the wire. Previously it was held in an int, so
+    // 0xffffffff made val_n + 1 zero and the read below overran the buffer.
+    // Lengths of 2^31 - 1 and above never worked; reject them outright.
+    if(val_n >= 0x7fffffffU){
+        err = -EINVAL;
+        goto exit;
+    }
+    val = allocate(val_n + 1);
+    if(!val){
+        err = -ENOMEM;
+        goto exit;
+    }
+    err = unmarshal_bytes(io, val, val_n);
+    if(err) goto exit;
+    val[val_n] = '\0';
+  exit:
+    if(err){
+        if(val) deallocate(val);
+        val = NULL;
+        val_n = 0;
+    }
+    *s = val;
+    if(s_n) *s_n = val_n;
+    return err;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _XEN_LIB_MARSHAL_H_
+#define _XEN_LIB_MARSHAL_H_
+
+#include "iostream.h"
+
+/** A 16-bit uint in network order, e.g. a port number. */
+typedef uint16_t net16_t;
+
+/** A 32-bit uint in network order, e.g. an IP address. */
+typedef uint32_t net32_t;
+
+extern int marshal_flush(IOStream *io); /* flush the stream */
+
+extern int marshal_bytes(IOStream *io, void *s, uint32_t s_n); /* write s_n raw bytes; -EIO on a short write */
+extern int unmarshal_bytes(IOStream *io, void *s, uint32_t s_n); /* read s_n raw bytes; -EIO on a short read */
+
+extern int marshal_uint8(IOStream *io, uint8_t x); /* single byte */
+extern int unmarshal_uint8(IOStream *io, uint8_t *x);
+
+extern int marshal_uint16(IOStream *io, uint16_t x); /* host value, sent in network order */
+extern int unmarshal_uint16(IOStream *io, uint16_t *x);
+
+extern int marshal_uint32(IOStream *io, uint32_t x); /* host value, sent in network order */
+extern int unmarshal_uint32(IOStream *io, uint32_t *x);
+
+extern int marshal_int32(IOStream *io, int32_t x); /* host value, sent in network order */
+extern int unmarshal_int32(IOStream *io, int32_t *x);
+
+extern int marshal_uint64(IOStream *io, uint64_t x); /* sent as two uint32, high word first */
+extern int unmarshal_uint64(IOStream *io, uint64_t *x);
+
+extern int marshal_net16(IOStream *io, net16_t x); /* sent as is, no byte-order conversion */
+extern int unmarshal_net16(IOStream *io, net16_t *x);
+
+extern int marshal_net32(IOStream *io, net32_t x); /* sent as is, no byte-order conversion */
+extern int unmarshal_net32(IOStream *io, net32_t *x);
+
+extern int marshal_string(IOStream *io, char *s, uint32_t s_n); /* uint32 length, then the bytes */
+extern int unmarshal_string(IOStream *io, char *s, uint32_t s_n); /* into a buffer of size s_n */
+extern int unmarshal_new_string(IOStream *io, char **s, uint32_t *s_n); /* into a newly allocated buffer */
+
+#endif /* ! _XEN_LIB_MARSHAL_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2003 - 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "select.h"
+
+/** Empty all three file descriptor sets and reset the highest
+ * descriptor number to 0.
+ *
+ * @param set select set
+ */
+void SelectSet_zero(SelectSet *set){
+    FD_ZERO(&set->rd);
+    FD_ZERO(&set->wr);
+    FD_ZERO(&set->er);
+    set->n = 0;
+}
+
+/** Add a file descriptor to the read set.
+ * Also raises the recorded highest descriptor if fd exceeds it.
+ *
+ * @param set select set
+ * @param fd file descriptor
+ */
+void SelectSet_add_read(SelectSet *set, int fd){
+    if(set->n < fd){
+        set->n = fd;
+    }
+    FD_SET(fd, &set->rd);
+}
+
+/** Add a file descriptor to the write set.
+ * Also raises the recorded highest descriptor if fd exceeds it.
+ *
+ * @param set select set
+ * @param fd file descriptor
+ */
+void SelectSet_add_write(SelectSet *set, int fd){
+    if(set->n < fd){
+        set->n = fd;
+    }
+    FD_SET(fd, &set->wr);
+}
+
+/** Call select() on the three sets, using the highest recorded
+ * descriptor plus one as the descriptor count.
+ *
+ * @param set select set
+ * @param timeout timeout (may be NULL for no timeout)
+ * @return the value returned by select()
+ */
+int SelectSet_select(SelectSet *set, struct timeval *timeout){
+    int nfds = set->n+1;
+    return select(nfds, &set->rd, &set->wr, &set->er, timeout);
+}
--- /dev/null
+/*
+ * Copyright (C) 2003 - 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _VFC_SELECT_H_
+#define _VFC_SELECT_H_
+
+/** Set of file descriptors for select.
+ */
+typedef struct SelectSet {
+ int n; /* highest descriptor added so far (0 after SelectSet_zero) */
+ fd_set rd, wr, er; /* read, write and third (exception) sets passed to select() */
+} SelectSet;
+
+extern void SelectSet_zero(SelectSet *set); /* empty all sets */
+extern void SelectSet_add_read(SelectSet *set, int fd); /* add fd to the read set */
+extern void SelectSet_add_write(SelectSet *set, int fd); /* add fd to the write set */
+extern int SelectSet_select(SelectSet *set, struct timeval *timeout); /* returns select()'s result */
+
+#endif /* ! _VFC_SELECT_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sys/time.h>
+#include <time.h>
+
+#include "allocate.h"
+#include "timer.h"
+
+#define MODULE_NAME "TIMER"
+#undef DEBUG
+#define DEBUG 1
+#include "debug.h"
+
+static Timer *timers = NULL; /* pending timers, kept in increasing expiry order by Timer_set() */
+
+/** Current time from gettimeofday(), as seconds in a double.
+ *
+ * @return time now, or 0.0 if the time could not be read
+ */
+double time_now(void){
+    struct timeval tv;
+    double secs = 0.0;
+    if(gettimeofday(&tv, NULL) == 0){
+        secs = (double)tv.tv_sec + (1.0e-6 * (double)tv.tv_usec);
+    }
+    return secs;
+}
+
+/** Arm the process real-time interval timer for an expiry time.
+ * The delay used is never less than 10 ms, even if the expiry time is
+ * sooner or already past. An expiry of exactly 0 passes a zero value
+ * to setitimer() instead.
+ *
+ * @param expiry time (in seconds)
+ * @return result of setitimer()
+ */
+static int timer_set(double expiry){
+    struct itimerval val = {};
+    struct itimerval old = {};
+
+    // val starts out zeroed, so it only needs filling in for a real expiry.
+    if(expiry != 0.0){
+        double delay = expiry - time_now();
+        long whole;
+        if(delay < 0.01) delay = 0.01;
+        whole = (long)delay;
+        val.it_value.tv_sec = whole;
+        val.it_value.tv_usec = (long)((delay - (double)whole) * 1.0e6);
+    }
+    return setitimer(ITIMER_REAL, &val, &old);
+}
+
+/** Release a timer. Compiles to nothing when USE_GC is defined.
+ *
+ * @param z timer (may be NULL)
+ */
+static void Timer_free(Timer *z){
+#ifndef USE_GC
+    if(z){
+        deallocate(z);
+    }
+#endif
+}
+
+/** Process any expired timers.
+ * Calls the functions of expired timers and removes them
+ * from the timer list.
+ * Reschedules the interval timer for the earliest expiring timer
+ * (if any).
+ *
+ * Each timer is unlinked before its function runs, so the function may
+ * safely call Timer_set() or Timer_cancel(). A timer it adds that has
+ * already expired is run in this same pass.
+ *
+ * Should not be called from within the SIGALRM handler - set
+ * a flag there and call it later.
+ *
+ * @return 0 on success, error code otherwise.
+ */
+int process_timers(void){
+    double now = time_now();
+    Timer *curr;
+    // Always work from the current head: the list must never contain a
+    // timer that is running or has been freed.
+    while((curr = timers) && !(curr->expiry > now)){
+        timers = curr->next;
+        curr->next = NULL;
+        if(curr->fn) curr->fn(curr);
+        Timer_free(curr);
+    }
+    timer_set((timers ? timers->expiry : 0));
+    return 0;
+}
+
+/** Create a timer that expires after a delay and add it to the list.
+ *
+ * @param delay delay from now (in seconds)
+ * @param fn function to call on expiry
+ * @param data user data stored in the timer
+ * @return the new timer, or NULL if allocation failed
+ */
+Timer * Timer_set(double delay, TimerFn *fn, void *data){
+    // Get 'now'.
+    double now = time_now();
+    Timer *timer = NULL, *prev, *curr, *next;
+    timer = ALLOCATE(Timer);
+    if(!timer) goto exit;
+    // Add delay to now to get expiry time.
+    timer->expiry = now + delay;
+    timer->fn = fn;
+    timer->data = data;
+
+    // Insert timer in list ordered by (increasing) expiry time.
+    prev = NULL;
+    for(curr = timers; curr; prev = curr, curr = next){
+        next = curr->next;
+        if(timer->expiry < curr->expiry) break;
+    }
+    if(prev){
+        prev->next = timer;
+    } else {
+        timers = timer;
+    }
+    timer->next = curr;
+
+    // Set interval timer to go off for earliest expiry time.
+    // That is the list head, which need not be the timer just added:
+    // arming for a later timer would postpone the ones ahead of it.
+    timer_set(timers->expiry);
+  exit:
+    return timer;
+}
+
+/** Remove a timer from the list and release it.
+ *
+ * @param timer timer to cancel
+ * @return 0 if the timer was found and removed, -ENOENT otherwise
+ */
+int Timer_cancel(Timer *timer){
+    Timer **link;
+    // Walk the links rather than the nodes so the head needs no special case.
+    for(link = &timers; *link; link = &(*link)->next){
+        if(*link == timer){
+            *link = timer->next;
+            timer->next = NULL;
+            Timer_free(timer);
+            return 0;
+        }
+    }
+    return -ENOENT;
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or
+ * (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _VNET_TIMER_H_
+#define _VNET_TIMER_H_
+
+struct Timer;
+
+typedef void TimerFn(struct Timer *); /* expiry callback; receives the expiring timer */
+
+typedef struct Timer {
+ TimerFn *fn; /* function called on expiry (may be NULL) */
+ void *data; /* user data given to Timer_set() */
+ double expiry; /* expiry time in seconds, on the time_now() scale */
+ struct Timer *next; /* next timer in the list */
+} Timer;
+
+extern void timer_alarm(void); /* NOTE(review): not defined in timer.c - confirm where it lives */
+extern double time_now(void); /* current time in seconds, 0.0 on failure */
+extern int process_timers(void); /* run expired timers and re-arm the interval timer */
+extern Timer * Timer_set(double delay, TimerFn *fn, void *data); /* add a timer expiring delay seconds from now */
+extern int Timer_cancel(Timer *timer); /* remove a timer; -ENOENT if it is not in the list */
+
+#endif /* ! _VNET_TIMER_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include "allocate.h"
+#include "hash_table.h"
+#include "sys_net.h"
+#include "sys_string.h"
+#include "connection.h"
+#include "marshal.h"
+#include "timer.h"
+
+#undef offsetof
+#include "vnetd.h"
+#include "vcache.h"
+
+#define MODULE_NAME "VARP"
+#define DEBUG 1
+#undef DEBUG
+#include "debug.h"
+
+static VarpCache *vcache = NULL;
+
+/** Initialize an empty message queue.
+ *
+ * @param queue queue
+ * @param maxlen length limit stored in the queue
+ */
+void IPMessageQueue_init(IPMessageQueue *queue, int maxlen){
+    queue->maxlen = maxlen;
+    queue->len = 0;
+    queue->msg = NULL;
+}
+
+/** Empty a message queue. The messages themselves are only unlinked;
+ * maxlen is left alone.
+ *
+ * @param queue queue
+ */
+void IPMessageQueue_clear(IPMessageQueue *queue){
+    queue->len = 0;
+    queue->msg = NULL;
+}
+
+/** Cut a message queue short.
+ * The list is cut at its n'th link, so the first n-1 messages are kept
+ * (the cut point is unchanged from before). The stored length is now
+ * updated to match. Nothing happens if the list has fewer than n
+ * messages or n is less than 1.
+ *
+ * @param queue queue
+ * @param n position of the cut
+ */
+void IPMessageQueue_truncate(IPMessageQueue *queue, int n){
+    IPMessage **p = &queue->msg;
+    int i;
+    for(i = 1; *p; p = &(*p)->next, i++){
+        if(i == n){
+            *p = NULL;
+            // i-1 messages remain; without this len keeps counting the
+            // dropped ones and pop() would run off the end of the list.
+            queue->len = i - 1;
+            break;
+        }
+    }
+}
+
+/** Push a message on the front of a queue, truncating the queue
+ * once its length reaches maxlen.
+ *
+ * @param queue queue
+ * @param msg message to add
+ */
+void IPMessageQueue_add(IPMessageQueue *queue, IPMessage *msg){
+    msg->next = queue->msg;
+    queue->msg = msg;
+    queue->len += 1;
+    if(queue->maxlen <= queue->len){
+        IPMessageQueue_truncate(queue, queue->maxlen);
+    }
+}
+
+/** Take the message at the front of a queue.
+ *
+ * @param queue queue
+ * @return the message (unlinked), or NULL if the queue is empty
+ */
+IPMessage * IPMessageQueue_pop(IPMessageQueue *queue){
+    IPMessage *msg = NULL;
+    if(queue->len > 0){
+        msg = queue->msg;
+        if(msg){
+            queue->len--;
+            queue->msg = msg->next;
+            msg->next = NULL;
+        } else {
+            // The count said there was a message but the list is empty:
+            // resynchronize rather than dereference NULL.
+            queue->len = 0;
+        }
+    }
+    return msg;
+}
+
+void VarpCache_sweep(VarpCache *z, int all);
+
+/** Send a varp protocol message.
+ * The header is written to the connection's out stream, which is then
+ * flushed.
+ *
+ * @param conn connection to send on
+ * @param opcode varp opcode (host order)
+ * @param vnet vnet id (in network order)
+ * @param vmac vmac (in network order)
+ * @param addr address, copied into the header unchanged
+ * @return 0 on success, error code otherwise (including a failed flush)
+ */
+int varp_send(Conn *conn, uint16_t opcode, uint32_t vnet, Vmac *vmac, uint32_t addr){
+    int err = 0;
+    int flush_err;
+    int varp_n = sizeof(VarpHdr);
+    VarpHdr varph = {};
+
+    varph.id = htons(VARP_ID);
+    varph.opcode = htons(opcode);
+    varph.vnet = vnet;
+    varph.vmac = *vmac;
+    varph.addr = addr;
+
+    // Disabled debugging aid.
+    if(0){
+        struct sockaddr_in self;
+        // getsockname() reads this as the buffer size, so it must be set.
+        socklen_t self_n = sizeof(self);
+        getsockname(conn->sock, (struct sockaddr *)&self, &self_n);
+        dprintf("> sockname addr=%s port=%d\n",
+                inet_ntoa(self.sin_addr), ntohs(self.sin_port));
+    }
+    dprintf("> addr=%s opcode=%d\n",
+            inet_ntoa(conn->addr.sin_addr), opcode);
+    dprintf("> vnet=%d vmac=" MACFMT " addr=" IPFMT "\n",
+            ntohl(vnet), MAC6TUPLE(vmac->mac), NIPQUAD(addr));
+    err = marshal_bytes(conn->out, &varph, varp_n);
+    // Still flush after a failed write, as before, but do not report
+    // success if the flush itself failed.
+    flush_err = marshal_flush(conn->out);
+    if(!err) err = flush_err;
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Test some flags.
+ *
+ * @param z varp entry
+ * @param flags flags to test
+ * @return the entry's flags masked with flags (nonzero if any is set)
+ */
+int VCEntry_get_flags(VCEntry *z, int flags){
+    int masked = z->flags & flags;
+    return masked;
+}
+
+/** Turn some flags on or off.
+ *
+ * @param z varp entry
+ * @param flags flags to change
+ * @param set nonzero to turn the flags on, zero to turn them off
+ * @return the entry's flags after the change
+ */
+int VCEntry_set_flags(VCEntry *z, int flags, int set){
+    if(!set){
+        z->flags &= ~flags;
+        return z->flags;
+    }
+    z->flags |= flags;
+    return z->flags;
+}
+
+/** Print a varp entry to stdout.
+ * Shows the entry address, a three-letter state tag (INC, RCH, FLD or UNK),
+ * a "P" marker while the probing flag is set, the key, the care-of address
+ * and the timestamp. A null entry is reported as such.
+ *
+ * @param ventry varp entry (may be null)
+ */
+void VCEntry_print(VCEntry *ventry){
+    const char *c, *d;
+
+    if(!ventry){
+        printf("VENTRY: Null!\n");
+        return;
+    }
+    switch(ventry->state){
+    case VCACHE_STATE_INCOMPLETE: c = "INC"; break;
+    case VCACHE_STATE_REACHABLE: c = "RCH"; break;
+    case VCACHE_STATE_FAILED: c = "FLD"; break;
+    default: c = "UNK"; break;
+    }
+    d = (VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING) ? "P" : " ");
+
+    // %p takes a void pointer, and the vnet id is an unsigned 32-bit value.
+    printf("VENTRY(%p %s %s vnet=%u vmac=" MACFMT " addr=" IPFMT " time=%g)\n",
+           (void *)ventry,
+           c, d,
+           (unsigned)ntohl(ventry->key.vnet),
+           MAC6TUPLE(ventry->key.vmac.mac),
+           NIPQUAD(ventry->addr),
+           ventry->timestamp);
+}
+
+int VCEntry_schedule(VCEntry *ventry);
+void VCEntry_solicit(VCEntry *ventry);
+
+/** Function called when a varp entry timer goes off.
+ * The entry is taken from the timer's data field.
+ * A reachable entry is left alone. Otherwise, while fewer than
+ * VCACHE_PROBE_MAX probes have been counted, the probe count is bumped and
+ * VCEntry_schedule() is called; once the limit is reached the entry is
+ * marked failed and its message queue is cleared.
+ * Whenever probing does not continue, all three probe flags are cleared.
+ *
+ * NOTE(review): VCEntry_schedule() returns nonzero only when probes == 1,
+ * so after the increment here it normally sends a solicit and returns 0,
+ * which ends probing after a single round - confirm this is intended.
+ *
+ * @param timer timer that fired; timer->data is the ventry
+ */
+static void ventry_timer_fn(Timer *timer){
+ VCEntry *ventry = timer->data;
+ int probing = 0, scheduled = 0;
+
+ //dprintf(">\n"); VCEntry_print(ventry);
+ if(ventry->state == VCACHE_STATE_REACHABLE){
+ // Already resolved: nothing to do, probing stays 0.
+ } else {
+ // Probe if haven't run out of tries, otherwise fail.
+ if(ventry->probes < VCACHE_PROBE_MAX){
+ //probing = 1;
+ ventry->probes++;
+ scheduled = VCEntry_schedule(ventry);
+ //VCEntry_solicit(ventry);
+ // Probing continues only if a new timer was armed.
+ probing = scheduled;
+ } else {
+ ventry->state = VCACHE_STATE_FAILED;
+ // Drops the queue's references; the messages themselves are not released here.
+ IPMessageQueue_clear(&ventry->queue);
+ }
+ }
+ if(!probing){
+ // Probing is over: forget which kind of probe was in progress.
+ VCEntry_set_flags(ventry,
+ (VCACHE_FLAG_PROBING
+ | VCACHE_FLAG_REMOTE_PROBE
+ | VCACHE_FLAG_LOCAL_PROBE),
+ 0);
+ }
+ // Leave the probing flag matching the outcome above.
+ VCEntry_set_flags(ventry, VCACHE_FLAG_PROBING, probing);
+ //dprintf("<\n");
+}
+
+/** Take the next probing step for a varp entry.
+ * On the first probe (probes == 1) a timer is set for VCACHE_LOCAL_DELAY
+ * with ventry_timer_fn as its handler; on any other probe count a solicit
+ * is sent straight away and no timer is set.
+ *
+ * @param ventry varp entry
+ * @return 1 if a timer was set, 0 otherwise
+ */
+int VCEntry_schedule(VCEntry *ventry){
+    if(ventry->probes != 1){
+        VCEntry_solicit(ventry);
+        return 0;
+    }
+    Timer_set(VCACHE_LOCAL_DELAY, ventry_timer_fn, ventry);
+    return 1;
+}
+
+/** Create a varp entry. Initializes the internal state.
+ * The new entry starts in the incomplete state, stamped with the current
+ * time.
+ *
+ * @param vnet vnet id
+ * @param vmac virtual MAC address (copied)
+ * @return ventry, or null if the allocation failed
+ */
+VCEntry * VCEntry_new(uint32_t vnet, Vmac *vmac){
+    VCEntry *z = ALLOCATE(VCEntry);
+    if(!z){
+        // Honour the documented "or null" contract instead of dereferencing.
+        return NULL;
+    }
+    z->state = VCACHE_STATE_INCOMPLETE;
+    z->timestamp = time_now();
+    z->key.vnet = vnet;
+    z->key.vmac = *vmac;
+    return z;
+}
+
+/** Hash function for keys in the varp cache.
+ * Hashes the vnet id and mac: the vnet id is combined with the first four
+ * MAC bytes, and the result with the last two.
+ * Each byte is widened to unsigned long before shifting, so a byte with its
+ * top bit set is never shifted into the sign bit of an int.
+ *
+ * @param k key (VCKey)
+ * @return hashcode
+ */
+Hashcode vcache_key_hash_fn(void *k){
+    VCKey *key = k;
+    Hashcode h;
+    h = hash_2ul(key->vnet,
+                 ((unsigned long)key->vmac.mac[0] << 24) |
+                 ((unsigned long)key->vmac.mac[1] << 16) |
+                 ((unsigned long)key->vmac.mac[2] << 8) |
+                 ((unsigned long)key->vmac.mac[3] ));
+    h = hash_hul(h,
+                 ((unsigned long)key->vmac.mac[4] << 8) |
+                 ((unsigned long)key->vmac.mac[5] ));
+    return h;
+}
+
+/** Equality test for keys in the varp cache.
+ * Two keys match when their vnet ids are equal and their MAC addresses
+ * agree over ETH_ALEN bytes.
+ *
+ * @param k1 key to compare (VCKey)
+ * @param k2 key to compare (VCKey)
+ * @return 1 if equal, 0 otherwise
+ */
+int vcache_key_equal_fn(void *k1, void *k2){
+    VCKey *a = k1;
+    VCKey *b = k2;
+
+    if(a->vnet != b->vnet){
+        return 0;
+    }
+    return memcmp(a->vmac.mac, b->vmac.mac, ETH_ALEN) == 0;
+}
+
+void VarpCache_schedule(VarpCache *z);
+
+/** Handler for the varp table timer.
+ * Sweeps old entries of the table held in the timer's data field and then
+ * re-arms the table timer. Does nothing if the timer carries no table.
+ *
+ * @param timer timer that fired; timer->data is the varp table
+ */
+static void vcache_timer_fn(Timer *timer){
+    VarpCache *cache = timer->data;
+
+    if(!cache){
+        return;
+    }
+    VarpCache_sweep(cache, 0);
+    VarpCache_schedule(cache);
+}
+
+/** Arm the varp table timer.
+ * Sets a timer for VCACHE_ENTRY_TTL with vcache_timer_fn as its handler and
+ * the table as its data.
+ *
+ * @param z varp table
+ */
+void VarpCache_schedule(VarpCache *z){
+    VarpCache *cache = z;
+
+    Timer_set(VCACHE_ENTRY_TTL, vcache_timer_fn, cache);
+}
+
+/** Print a varp table.
+ * Prints every entry of the table that was passed in (not the global
+ * cache) using VCEntry_print(). A null table prints nothing.
+ *
+ * @param z table
+ */
+void VarpCache_print(VarpCache *z){
+    HashTable_for_decl(entry);
+    VCEntry *ventry;
+
+    dprintf(">\n");
+    if(z){
+        HashTable_for_each(entry, z->table){
+            ventry = entry->value;
+            VCEntry_print(ventry);
+        }
+    }
+    dprintf("<\n");
+}
+
+/** Print the global varp cache.
+ * Convenience wrapper: passes the global cache to VarpCache_print().
+ */
+void vcache_print(void){
+    VarpCache *cache = vcache;
+
+    VarpCache_print(cache);
+}
+
+/** Create a varp table.
+ * Allocates the cache and a hash table of VCACHE_BUCKETS buckets, installs
+ * the varp key hash and equality functions on the table, and arms the
+ * table timer.
+ *
+ * @return new table
+ */
+VarpCache * VarpCache_new(void){
+    VarpCache *cache = ALLOCATE(VarpCache);
+    HashTable *table = HashTable_new(VCACHE_BUCKETS);
+
+    cache->table = table;
+    table->key_equal_fn = vcache_key_equal_fn;
+    table->key_hash_fn = vcache_key_hash_fn;
+    VarpCache_schedule(cache);
+    return cache;
+}
+
+/** Add a new entry to the varp table.
+ * The entry is created with VCEntry_new() and stored with the entry
+ * pointer serving as both key and value.
+ *
+ * @param z table
+ * @param vnet vnet id
+ * @param vmac virtual MAC address (copied)
+ * @return new entry, or null if the entry could not be created
+ */
+VCEntry * VarpCache_add(VarpCache *z, uint32_t vnet, Vmac *vmac){
+    VCEntry *ventry = VCEntry_new(vnet, vmac);
+
+    if(!ventry){
+        // Never hand a null key to the hash table.
+        return NULL;
+    }
+    //dprintf("> "); VCEntry_print(ventry);
+    HashTable_add(z->table, ventry, ventry);
+    return ventry;
+}
+
+/** Remove an entry from the varp table.
+ * The entry pointer itself is passed to HashTable_remove() as the key,
+ * mirroring how VarpCache_add() stores entries.
+ * The entry is not released here.
+ *
+ * @param z table
+ * @param ventry entry to remove
+ * @return result of HashTable_remove() (a removed count, per the original comment)
+ */
+int VarpCache_remove(VarpCache *z, VCEntry *ventry){
+ return HashTable_remove(z->table, ventry);
+}
+
+/** Look up an entry in the varp table.
+ * Builds a temporary key from the vnet id and MAC address and asks the
+ * hash table for the matching value.
+ *
+ * @param z table
+ * @param vnet vnet id
+ * @param vmac virtual MAC address
+ * @return entry found or null
+ */
+VCEntry * VarpCache_lookup(VarpCache *z, uint32_t vnet, Vmac *vmac){
+    VCKey key = { .vnet = vnet, .vmac = *vmac };
+
+    return HashTable_get(z->table, &key);
+}
+
+/** Send varp requests for an entry.
+ * If the local-probe flag is set, a request goes out on the broadcast
+ * connection; if the remote-probe flag is set, a request goes to every
+ * peer connection.
+ *
+ * @param ventry varp entry to solicit for
+ */
+void VCEntry_solicit(VCEntry *ventry){
+    uint32_t vnet = ventry->key.vnet;
+    Vmac *vmac = &ventry->key.vmac;
+    ConnList *l;
+
+    dprintf(">\n");
+    if(VCEntry_get_flags(ventry, VCACHE_FLAG_LOCAL_PROBE)){
+        dprintf("> local probe\n");
+        varp_send(vnetd->bcast_conn, VARP_OP_REQUEST, vnet, vmac, ventry->addr);
+    }
+    if(VCEntry_get_flags(ventry, VCACHE_FLAG_REMOTE_PROBE)){
+        dprintf("> remote probe\n");
+        l = vnetd->connections;
+        while(l){
+            varp_send(l->conn, VARP_OP_REQUEST, vnet, vmac, ventry->addr);
+            l = l->next;
+        }
+    }
+    dprintf("<\n");
+}
+
+/** Start resolving a varp entry on behalf of a message.
+ * Puts the entry into the incomplete state, turns on the given flags and
+ * queues the message on the entry. If the entry is not already probing,
+ * the probing flag is set, the probe count is reset to 1 and
+ * VCEntry_schedule() is called.
+ *
+ * @param ventry varp entry
+ * @param msg message to queue on the entry
+ * @param flags flags to turn on
+ * @return 0 (no failure is reported)
+ */
+int VCEntry_resolve(VCEntry *ventry, IPMessage *msg, int flags){
+    const int err = 0;
+    int already_probing;
+
+    dprintf("> ");
+    ventry->state = VCACHE_STATE_INCOMPLETE;
+    VCEntry_set_flags(ventry, flags, 1);
+    IPMessageQueue_add(&ventry->queue, msg);
+    already_probing = VCEntry_get_flags(ventry, VCACHE_FLAG_PROBING);
+    if(already_probing == 0){
+        VCEntry_set_flags(ventry, VCACHE_FLAG_PROBING, 1);
+        ventry->probes = 1;
+        VCEntry_schedule(ventry);
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Update a ventry from a varp header.
+ * Permanent entries are left untouched. Otherwise the care-of address is
+ * taken from the header, the timestamp is set to 'now' and the state is
+ * set to the one given. When the entry ends up reachable, every message
+ * waiting on its queue is popped and answered with a varp announce on the
+ * connection that message arrived on.
+ *
+ * @param ventry varp entry
+ * @param msg incoming message (not used by this function)
+ * @param varph varp header supplying the care-of address
+ * @param state new state
+ * @return 0
+ */
+int VCEntry_update(VCEntry *ventry, IPMessage *msg, VarpHdr *varph, int state){
+    double now = time_now();
+    IPMessage *pending;
+
+    if(VCEntry_get_flags(ventry, VCACHE_FLAG_PERMANENT)){
+        return 0;
+    }
+    ventry->addr = varph->addr;
+    ventry->timestamp = now;
+    ventry->state = state;
+    if(ventry->state != VCACHE_STATE_REACHABLE){
+        return 0;
+    }
+    // Process the output queue.
+    for(pending = IPMessageQueue_pop(&ventry->queue);
+        pending;
+        pending = IPMessageQueue_pop(&ventry->queue)){
+        dprintf("> announce\n");
+        varp_send(pending->conn, VARP_OP_ANNOUNCE, ventry->key.vnet, &ventry->key.vmac, ventry->addr);
+    }
+    return 0;
+}
+
+/** Update the table entry named by a varp header.
+ * Looks the entry up by the header's vnet id and vmac and, if it exists,
+ * hands it to VCEntry_update().
+ *
+ * @param z table
+ * @param msg incoming message
+ * @param varph varp header
+ * @param state state
+ * @return result of VCEntry_update(), or -ENOENT if no entry was found
+ */
+int VarpCache_update(VarpCache *z, IPMessage *msg, VarpHdr *varph, int state){
+    VCEntry *ventry;
+    int err;
+
+    dprintf(">\n");
+    ventry = VarpCache_lookup(z, varph->vnet, &varph->vmac);
+    err = (ventry ? VCEntry_update(ventry, msg, varph, state) : -ENOENT);
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+
+/** Put old varp entries into the incomplete state.
+ * Permanent entries are not changed.
+ * If 'all' is non-zero, all non-permanent entries
+ * are put into the incomplete state, regardless of age.
+ * An entry counts as old when its timestamp is more than VCACHE_ENTRY_TTL
+ * before now. The table swept is the one passed in, not the global cache.
+ *
+ * @param z table
+ * @param all reset all entries if non-zero
+ */
+void VarpCache_sweep(VarpCache *z, int all){
+    HashTable_for_decl(entry);
+    VCEntry *ventry;
+    double now = time_now();
+    double old = now - VCACHE_ENTRY_TTL;
+
+    dprintf(">\n");
+    HashTable_for_each(entry, z->table){
+        ventry = entry->value;
+        if(!VCEntry_get_flags(ventry, VCACHE_FLAG_PERMANENT) &&
+           (all || (ventry->timestamp < old))){
+            ventry->state = VCACHE_STATE_INCOMPLETE;
+        }
+    }
+    dprintf("<\n");
+}
+
+/** Forward a varp message.
+ * A local message is re-sent to every peer connection; a non-local one is
+ * re-sent on the broadcast connection. The opcode is converted to host
+ * order for varp_send(); the other fields are passed through as they are.
+ *
+ * @param varph varp message to forward
+ * @param local whether it's local or not
+ */
+void vcache_forward_varp(VarpHdr *varph, int local){
+    uint16_t opcode = ntohs(varph->opcode);
+    ConnList *l;
+
+    if(!local){
+        varp_send(vnetd->bcast_conn, opcode, varph->vnet, &varph->vmac, varph->addr);
+        return;
+    }
+    l = vnetd->connections;
+    while(l){
+        varp_send(l->conn, opcode, varph->vnet, &varph->vmac, varph->addr);
+        l = l->next;
+    }
+}
+
+/** Handle a varp request.
+ * The active implementation (the #if 1 branch) only forwards the request
+ * with vcache_forward_varp() and reports success.
+ *
+ * The #else branch is a disabled alternative that answers from the cache
+ * or queues the request for resolution. It is not compiled, and its
+ * varp_send() calls do not match the five-argument varp_send() defined in
+ * this file.
+ *
+ * @param msg incoming message
+ * @param varph varp message
+ * @param local passed on to vcache_forward_varp()
+ * @return 0 in the active implementation
+ */
+#if 1
+int vcache_handle_request(IPMessage *msg, VarpHdr *varph, int local){
+ dprintf("> local=%d\n", local);
+ vcache_forward_varp(varph, local);
+ dprintf("<\n");
+ return 0;
+}
+
+#else
+int vcache_handle_request(IPMessage *msg, VarpHdr *varph, int local){
+ int err = -ENOENT;
+ uint32_t vnet;
+ Vmac *vmac;
+ VCEntry *ventry = NULL;
+ int reply = 0;
+
+ dprintf(">\n");
+ vnet = htonl(varph->vnet);
+ vmac = &varph->vmac;
+ ventry = VarpCache_lookup(vcache, vnet, vmac);
+ if(!ventry){
+ ventry = VarpCache_add(vcache, vnet, vmac);
+ }
+ if(local){
+ // Request coming from the local subnet (on our udp port).
+ if(ventry->state == VCACHE_STATE_REACHABLE){
+ if(local){
+ // Have an entry, and it's non-local - reply (locally).
+ // Potential out-of-date cache problem.
+ // Should query remotely instead of replying.
+ varp_send(conn, VARP_OP_ANNOUNCE, ventry);
+ }
+ } else {
+ // Incomplete entry. Resolve.
+ VCEntry_resolve(ventry, msg, VCACHE_FLAG_REMOTE_PROBE);
+ }
+ } else {
+ // Non-local request (on one of our tcp connections).
+ if(ventry->state == VCACHE_STATE_REACHABLE){
+ if(local){
+ // Have an entry and it's local - reply (remotely).
+ // Potential out-of-date cache problem.
+ // Should query locally instead of replying.
+ varp_send(msg->conn, VARP_OP_ANNOUNCE, ventry);
+ } else {
+ // Have a non-local entry - do nothing and assume someone else
+ // will reply.
+ }
+ } else {
+ // Incomplete entry. Resolve.
+ VCEntry_resolve(ventry, msg, VCACHE_FLAG_LOCAL_PROBE);
+ }
+ }
+ exit:
+ dprintf("< err=%d\n", err);
+ return err;
+}
+#endif
+
+/** Handle a varp announce message.
+ * The announce is forwarded first, then the matching entry in the global
+ * cache (if there is one) is updated and marked reachable.
+ *
+ * @param msg incoming message
+ * @param varph varp message
+ * @param local passed on to vcache_forward_varp()
+ * @return 0 if OK, -ENOENT if no matching entry
+ */
+int vcache_handle_announce(IPMessage *msg, VarpHdr *varph, int local){
+    vcache_forward_varp(varph, local);
+    return VarpCache_update(vcache, msg, varph, VCACHE_STATE_REACHABLE);
+}
+
+/** Handle an incoming varp message.
+ * Logs the source, destination and key of the message, then dispatches on
+ * the varp opcode to the request or announce handler.
+ *
+ * @param msg incoming message; msg->data is taken to be a VnetMsg
+ * @param local passed on to the opcode handler
+ * @return the handler's result, or -EINVAL for an unrecognised opcode
+ */
+int vcache_handle_message(IPMessage *msg, int local){
+    VnetMsg *vmsg = msg->data;
+    VarpHdr *varph = &vmsg->varp.varph;
+    uint16_t opcode = ntohs(varph->opcode);
+    int err;
+
+    dprintf(">\n");
+    dprintf("> src=%s:%d\n", inet_ntoa(msg->saddr.sin_addr), ntohs(msg->saddr.sin_port));
+    dprintf("> dst=%s:%d\n", inet_ntoa(msg->daddr.sin_addr), ntohs(msg->daddr.sin_port));
+    dprintf("> opcode=%d vnet=%u vmac=" MACFMT "\n",
+            opcode, ntohl(varph->vnet), MAC6TUPLE(varph->vmac.mac));
+    if(opcode == VARP_OP_REQUEST){
+        err = vcache_handle_request(msg, varph, local);
+    } else if(opcode == VARP_OP_ANNOUNCE){
+        err = vcache_handle_announce(msg, varph, local);
+    } else {
+        err = -EINVAL;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Initialize the varp cache.
+ * Creates the global cache the first time it is called; later calls leave
+ * the existing cache in place.
+ *
+ * @return 0
+ */
+int vcache_init(void){
+    if(vcache == NULL){
+        vcache = VarpCache_new();
+    }
+    return 0;
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _VNET_VCACHE_H_
+#define _VNET_VCACHE_H_
+
+#include "hash_table.h"
+
+/** Time-to-live of varp cache entries (in seconds).
+ * Used in vcache.c both as the period of the table timer and as the age
+ * beyond which VarpCache_sweep() resets an entry to incomplete. */
+#define VCACHE_ENTRY_TTL 30.0
+
+/** Maximum number of varp probes to make.
+ * Once an entry's probe count reaches this, the entry timer marks it failed. */
+#define VCACHE_PROBE_MAX 5
+
+/** Interval between varp probes (in seconds). */
+#define VCACHE_PROBE_INTERVAL 3.0
+
+/** Delay before forwarding a local probe (in seconds).
+ * Timer interval used by VCEntry_schedule() for the first probe. */
+#define VCACHE_LOCAL_DELAY 2.0
+
+/** Number of buckets in the varp cache (must be prime).
+ * Passed to HashTable_new() when the cache is created. */
+#define VCACHE_BUCKETS 3001
+
+/** Values of VCEntry.state. */
+enum {
+ /** Address not (or no longer) known; new and swept entries are in this state. */
+ VCACHE_STATE_INCOMPLETE = 1,
+ /** Address known; set when an announce updates the entry. */
+ VCACHE_STATE_REACHABLE = 2,
+ /** Probing gave up after VCACHE_PROBE_MAX probes. */
+ VCACHE_STATE_FAILED = 3
+};
+
+/** Bits of VCEntry.flags. */
+enum {
+ /** A probe is in progress for the entry. */
+ VCACHE_FLAG_PROBING = 1,
+ /** Entry is exempt from updates and from sweeping. */
+ VCACHE_FLAG_PERMANENT = 2,
+ /** Solicits for the entry go out on the broadcast connection. */
+ VCACHE_FLAG_LOCAL_PROBE = 4,
+ /** Solicits for the entry go out to the peer connections. */
+ VCACHE_FLAG_REMOTE_PROBE = 8,
+};
+
+
+#include <asm/byteorder.h>
+/*
+ * Display an IP address in readable format.
+ *
+ * NIPQUAD and HIPQUAD expand to four comma-separated byte expressions for
+ * use as printf arguments together with IPFMT. NIPQUAD takes the bytes in
+ * memory order; HIPQUAD reverses them on little-endian hosts.
+ * The argument must be an lvalue, because its address is taken, and it is
+ * not parenthesized inside the expansion, so pass a plain variable or
+ * member expression.
+ */
+
+#define NIPQUAD(addr) \
+ ((unsigned char *)&addr)[0], \
+ ((unsigned char *)&addr)[1], \
+ ((unsigned char *)&addr)[2], \
+ ((unsigned char *)&addr)[3]
+
+#if defined(__LITTLE_ENDIAN)
+#define HIPQUAD(addr) \
+ ((unsigned char *)&addr)[3], \
+ ((unsigned char *)&addr)[2], \
+ ((unsigned char *)&addr)[1], \
+ ((unsigned char *)&addr)[0]
+#elif defined(__BIG_ENDIAN)
+#define HIPQUAD NIPQUAD
+#else
+#error "Please fix asm/byteorder.h"
+#endif /* __LITTLE_ENDIAN */
+
+/* printf format for the four bytes produced by NIPQUAD/HIPQUAD. */
+#define IPFMT "%u.%u.%u.%u"
+/* printf format for the six bytes produced by MAC6TUPLE. */
+#define MACFMT "%02x:%02x:%02x:%02x:%02x:%02x"
+
+/* Expands to the six bytes of a MAC address array as separate printf arguments. */
+#define MAC6TUPLE(_mac) (_mac)[0], (_mac)[1], (_mac)[2], (_mac)[3], (_mac)[4], (_mac)[5]
+
+/** A received message together with where it came from. */
+typedef struct IPMessage {
+ /** Connection the message arrived on; replies are sent back on it. */
+ Conn *conn;
+ /** Source address of the message. */
+ struct sockaddr_in saddr;
+ /** Destination address of the message. */
+ struct sockaddr_in daddr;
+ /** Message payload (vcache.c reads it as a VnetMsg). */
+ void *data;
+ /** Next message when linked into an IPMessageQueue. */
+ struct IPMessage *next;
+} IPMessage;
+
+/** Singly-linked list of messages; new messages are added at the head. */
+typedef struct IPMessageQueue {
+ /** Head of the list, or null when empty. */
+ IPMessage *msg;
+ /** Count of queued messages, maintained by the queue functions. */
+ int len;
+ /** Length at which IPMessageQueue_add() starts truncating. */
+ int maxlen;
+} IPMessageQueue;
+
+/** Key for varp cache entries.
+ * Hashed and compared field by field by the vcache key functions. */
+typedef struct VCKey {
+ /** Vnet id (network order). */
+ uint32_t vnet;
+ /** Virtual MAC address. */
+ Vmac vmac;
+} VCKey;
+
+/** An entry in the varp cache: maps a (vnet, vmac) key to a care-of address. */
+typedef struct VCEntry {
+ /** Key for the entry.
+ * Must stay the first member: the cache stores entries with the entry
+ * pointer as the hash key, and the key functions read it as a VCKey. */
+ VCKey key;
+
+ /** Care-of address for the key. */
+ uint32_t addr;
+
+ /** Alias coa if we are a gateway. */
+ //uint32_t gateway;
+ /** Encapsulation to use (if a gateway). */
+ //uint32_t encaps;
+
+ /** Where this entry came from. */
+ uint32_t source;
+
+ /** Last-updated timestamp. */
+ double timestamp;
+
+ /** State (one of the VCACHE_STATE_ values). */
+ short state;
+
+ /** Flags (bitwise OR of VCACHE_FLAG_ values). */
+ short flags;
+
+ /** Number of probes sent. */
+ int probes;
+
+ /** List of messages to reply to when completes. */
+ IPMessageQueue queue;
+
+} VCEntry;
+
+/** The varp cache. Varp cache entries indexed by VCKey. */
+typedef struct VarpCache {
+ /** Hash table holding the entries; each VCEntry is stored as both key and value. */
+ HashTable *table;
+} VarpCache;
+
+int vcache_init(void);
+int vcache_handle_message(IPMessage *msg, int local);
+
+#endif /* ! _VNET_VCACHE_H_ */
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+/** @file
+ *
+ * Vnetd tcp messages:
+ *
+ * - varp request: request care-of-addr for a vif.
+ * If know answer, reply. If not broadcast locally.
+ *
+ * - varp announce: reply to a varp request.
+ * If a (local) request is pending, remember and broadcast locally.
+ *
+ * - vnet subscribe: indicate there are local vifs in a vnet (use varp announce?).
+ *
+ * - vnet forward: tunneled broadcast packet to rebroadcast.
+ * Broadcast locally (if there are vifs in the vnet).
+ *
+ *
+ * Vnetd udp messages (varp):
+ *
+ * - local varp request:
+ * If know and vif is non-local, reply.
+ * If know and vif is local, do nothing (but announce will reset).
+ * If have entry saying is local and no-one answers - remove (? or rely on entry timeout).
+ * If don't know and there is no (quick) local reply, forward to peers.
+ *
+ * - remote varp request:
+ * If know, reply.
+ * If don't know, query locally (and queue request).
+ *
+ * - varp announce: remember and adjust vnet subscriptions.
+ * Forward to peers if a request is pending.
+ *
+ * Vnetd broadcast messages (tunneling):
+ *
+ * - etherip: forward to peers (on the right vnets)
+ *
+ * - esp: forward to peers (on the right vnets)
+ *
+ *
+ * For etherip can tell the vnet from the header (in clear).
+ * But for esp can't. So should use mcast to define? Or always some clear header?
+ *
+ * Make ssl on tcp connections optional.
+ *
+ * So far have been assuming esp for security.
+ * But could use vnetd to forward and use ssl on the connection.
+ * But has usual probs with efficiency.
+ * However, should 'just work' if the coa for the vif has been set
+ * to the vnetd. How? Vnetd configured to act as gateway for
+ * some peers? Then would rewrite varp announce to itself and forward
+ * traffic to peer.
+ *
+ * Simplify - make each vnetd have one peer?
+ * If need to link more subnets, add vnetds?
+ *
+ * Need requests table for each tcp conn (incoming).
+ * - entries we want to resolve (and fwd the answer).
+ *
+ * Need requests table for the udp socket.
+ * - entries we want to resolve (and return the answer).
+ *
+ * Need table of entries we know.
+ * - from caching local announce
+ * - from caching announce reply to forwarded request
+ *
+ * Problem with replying to requests from the cache - if the cache
+ * is out of date we reply with incorrect data. So if a VM migrates
+ * we will advertise the old location until it times out.
+ *
+ * So should probably not reply out of the cache at all - but always
+ * query for the answer. Could query direct to old location if
+ * entry is valid the first time, and broadcast if no reply in timeout.
+ * Causes delay if migrated - may as well broadcast.
+ *
+ * Need to watch out for query loops. If have 3 vnetds A,B,C and
+ * A gets a query, forwards to B and C. B forwards to C, which
+ * forwards to A, and on forever. So if have an entry that has been
+ * probed, do not forward it when get another query for it.
+ *
+ * @author Mike Wray <mike.wray@hpl.hp.com>
+ */
+
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <getopt.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <time.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#include <signal.h>
+#include <sys/wait.h>
+#include <sys/select.h>
+
+//#include </usr/include/linux/ip.h> // For struct iphdr;
+#include <linux/ip.h> // For struct iphdr;
+
+#include <linux/if_ether.h>
+#include "if_etherip.h"
+#include "if_varp.h"
+
+#include "allocate.h"
+
+#include "vnetd.h"
+#include "file_stream.h"
+#include "string_stream.h"
+#include "socket_stream.h"
+#include "sys_net.h"
+
+#include "enum.h"
+#include "sxpr.h"
+
+#include "marshal.h"
+#include "connection.h"
+#include "select.h"
+#include "timer.h"
+#include "vcache.h"
+
+int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val);
+
+#ifndef TRUE
+#define TRUE 1
+#endif
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+
+/** Socket flags.
+ * Distinct bits, so they can be OR-ed together (e.g. for the flags argument
+ * of create_socket()). What each bit does is decided by the code that
+ * consumes the flags. */
+enum {
+ VSOCK_REUSE=1,
+ VSOCK_BIND=2,
+ VSOCK_CONNECT=4,
+ VSOCK_BROADCAST=8,
+ VSOCK_MULTICAST=16,
+ };
+
+#define PROGRAM "vnetd"
+#define VERSION "0.1"
+
+#define MODULE_NAME PROGRAM
+#define DEBUG
+#undef DEBUG
+#include "debug.h"
+
+/* Command-line options. For each option:
+ * OPT_x is the short option character, KEY_x the long option name and
+ * DOC_x the help text printed by usage().
+ */
+
+/* UDP port. */
+#define OPT_PORT 'p'
+#define KEY_PORT "port"
+#define DOC_PORT "<port>\n\t" PROGRAM " UDP port (as a number or service name)"
+
+/* Multicast address. */
+#define OPT_ADDR 'm'
+#define KEY_ADDR "mcaddr"
+#define DOC_ADDR "<address>\n\t" PROGRAM " multicast address"
+
+/* Peer to connect to. */
+#define OPT_PEER 'r'
+#define KEY_PEER "peer"
+#define DOC_PEER "<peer>\n\t Peer " PROGRAM " to connect to (IP address or hostname)"
+
+/* Configuration file. Defined here but not listed in short_opts, long_opts or usage(). */
+#define OPT_FILE 'f'
+#define KEY_FILE "file"
+#define DOC_FILE "<file>\n\t Configuration file to load"
+
+/* Control port. Defined here but not listed in short_opts, long_opts or usage(). */
+#define OPT_CTRL 'c'
+#define KEY_CTRL "control"
+#define DOC_CTRL "<port>\n\t " PROGRAM " control port (as a number or service name)"
+
+/* Print help. */
+#define OPT_HELP 'h'
+#define KEY_HELP "help"
+#define DOC_HELP "\n\tprint help"
+
+/* Print version. */
+#define OPT_VERSION 'v'
+#define KEY_VERSION "version"
+#define DOC_VERSION "\n\tprint version"
+
+/* Verbose flag. */
+#define OPT_VERBOSE 'V'
+#define KEY_VERBOSE "verbose"
+#define DOC_VERBOSE "\n\tverbose flag"
+
+/** Print a usage message and exit.
+ * With a zero err the text goes to stdout and the exit status is 0;
+ * with a non-zero err it goes to stderr and the exit status is 1.
+ *
+ * @param err error code
+ */
+static void usage(int err){
+    static const struct {
+        int opt;
+        const char *key;
+        const char *doc;
+    } items[] = {
+        { OPT_ADDR, KEY_ADDR, DOC_ADDR },
+        { OPT_PORT, KEY_PORT, DOC_PORT },
+        { OPT_PEER, KEY_PEER, DOC_PEER },
+        { OPT_VERBOSE, KEY_VERBOSE, DOC_VERBOSE },
+        { OPT_VERSION, KEY_VERSION, DOC_VERSION },
+        { OPT_HELP, KEY_HELP, DOC_HELP },
+    };
+    FILE *out = stdout;
+    unsigned i;
+
+    if(err){
+        out = stderr;
+    }
+    fprintf(out, "Usage: %s [options]\n", PROGRAM);
+    for(i = 0; i < sizeof(items) / sizeof(items[0]); i++){
+        fprintf(out, "-%c, --%s %s\n", items[i].opt, items[i].key, items[i].doc);
+    }
+    exit(err ? 1 : 0);
+}
+
+/** Short options. Options followed by ':' take an argument.
+ * Built from the OPT_ characters so it stays in step with them, and
+ * terminated by 0 so it can be used as a string.
+ * Lists the same options as long_opts. */
+static char *short_opts = (char[]){
+ OPT_ADDR, ':',
+ OPT_PORT, ':',
+ OPT_PEER, ':',
+ OPT_HELP,
+ OPT_VERSION,
+ OPT_VERBOSE,
+ 0 };
+
+/** Long options.
+ * Each long name maps to the matching OPT_ character; the all-zero entry
+ * ends the table. */
+static struct option const long_opts[] = {
+ { KEY_ADDR, required_argument, NULL, OPT_ADDR },
+ { KEY_PORT, required_argument, NULL, OPT_PORT },
+ { KEY_PEER, required_argument, NULL, OPT_PEER },
+ { KEY_HELP, no_argument, NULL, OPT_HELP },
+ { KEY_VERSION, no_argument, NULL, OPT_VERSION },
+ { KEY_VERBOSE, no_argument, NULL, OPT_VERBOSE },
+ { NULL, 0, NULL, 0 }
+};
+
+/** Get the address of this vnetd, so that broadcast traffic we sent
+ * ourselves can be ignored.
+ * Looks up the local host name and resolves it with get_host_address();
+ * a zero result from that call is treated as "no address found".
+ * Only addr->sin_addr is written.
+ *
+ * @param addr receives the address
+ * @return 0 on success, the gethostname() result if that fails,
+ *         -ENOENT if the host name could not be resolved
+ */
+int get_self_addr(struct sockaddr_in *addr){
+    char hostname[1024] = {};
+    unsigned long saddr;
+    int err;
+
+    err = gethostname(hostname, sizeof(hostname) -1);
+    if(err){
+        return err;
+    }
+    if(get_host_address(hostname, &saddr) == 0){
+        return -ENOENT;
+    }
+    addr->sin_addr.s_addr = saddr;
+    return 0;
+}
+
+/** Marshal a message.
+ * Writes the id and opcode, then a body chosen by the id: for a varp
+ * message the rest of the VarpHdr as raw bytes, for a forward message the
+ * protocol, the length and the data.
+ *
+ * @param io destination
+ * @param msg message
+ * @return result of the last marshal call made (negative on error),
+ *         or -EINVAL for an unknown message id
+ */
+int VnetMsg_marshal(IOStream *io, VnetMsg *msg){
+    int hdr_n = sizeof(VnetMsgHdr);
+    int err;
+
+    if((err = marshal_uint16(io, msg->hdr.id)) < 0){
+        return err;
+    }
+    if((err = marshal_uint16(io, msg->hdr.opcode)) < 0){
+        return err;
+    }
+    if(msg->hdr.id == VNET_VARP_ID){
+        return marshal_bytes(io, ((char*)msg) + hdr_n, sizeof(VarpHdr) - hdr_n);
+    }
+    if(msg->hdr.id == VNET_FWD_ID){
+        if((err = marshal_uint16(io, msg->fwd.protocol)) < 0){
+            return err;
+        }
+        if((err = marshal_uint16(io, msg->fwd.len)) < 0){
+            return err;
+        }
+        return marshal_bytes(io, msg->fwd.data, msg->fwd.len);
+    }
+    return -EINVAL;
+}
+
+/** Unmarshal a message.
+ * Reads the id and opcode, then a body chosen by the id: for a varp
+ * message the rest of the VarpHdr as raw bytes (the opcode is byte-swapped
+ * back with htons first), for a forward message the protocol, the length
+ * and that many data bytes.
+ *
+ * NOTE(review): the forward length comes straight from the stream and is
+ * not checked against the capacity of msg->fwd.data, which is not visible
+ * here - confirm the buffer can hold any 16-bit length.
+ *
+ * @param io source
+ * @param msg message to unmarshal into
+ * @return result of the last unmarshal call made (negative on error),
+ *         or -EINVAL for an unknown message id
+ */
+int VnetMsg_unmarshal(IOStream *io, VnetMsg *msg){
+    int err = 0;
+    int hdr_n = sizeof(VnetMsgHdr);
+
+    dprintf("> id\n");
+    err = unmarshal_uint16(io, &msg->hdr.id);
+    if(err < 0) goto exit;
+    dprintf("> opcode\n");
+    err = unmarshal_uint16(io, &msg->hdr.opcode);
+    if(err < 0) goto exit;
+    switch(msg->hdr.id){
+    case VNET_VARP_ID:
+        msg->hdr.opcode = htons(msg->hdr.opcode);
+        // sizeof yields a size_t, which does not match %d; convert explicitly.
+        dprintf("> varp hdr_n=%d varphdr=%d\n", hdr_n, (int)sizeof(VarpHdr));
+        err = unmarshal_bytes(io, ((char*)msg) + hdr_n, sizeof(VarpHdr) - hdr_n);
+        break;
+    case VNET_FWD_ID:
+        dprintf("> forward\n");
+        err = unmarshal_uint16(io, &msg->fwd.protocol);
+        if(err < 0) goto exit;
+        dprintf("> forward len\n");
+        err = unmarshal_uint16(io, &msg->fwd.len);
+        if(err < 0) goto exit;
+        dprintf("> forward bytes\n");
+        err = unmarshal_bytes(io, msg->fwd.data, msg->fwd.len);
+        break;
+    default:
+        wprintf("> Invalid id %d\n", msg->hdr.id);
+        err = -EINVAL;
+        break;
+    }
+  exit:
+    dprintf("< err=%d \n", err);
+    return err;
+}
+
+/** Storage for the single vnetd instance. */
+Vnetd _vnetd = {};
+/** Pointer to the vnetd instance; code in this patch reaches the daemon state through it. */
+Vnetd *vnetd = &_vnetd;
+
+/** Counter for timer alarms.
+ */
+static unsigned timer_alarms = 0;
+
+/** Reset a vnetd to its default settings.
+ * Everything is cleared, then the port is set to VNETD_PORT (converted
+ * with htons), the peer port and multicast port are set to the same value,
+ * the verbose flag is turned off, the peer list is emptied and the
+ * multicast address is set to VARP_MCAST_ADDR.
+ *
+ * @param vnetd vnetd
+ */
+void vnetd_set_defaults(Vnetd *vnetd){
+    *vnetd = (Vnetd){};
+    vnetd->verbose = FALSE;
+    vnetd->peers = ONULL;
+    vnetd->port = htons(VNETD_PORT);
+    vnetd->peer_port = vnetd->port;
+    vnetd->mcast_addr.sin_port = vnetd->port;
+    vnetd->mcast_addr.sin_addr.s_addr = VARP_MCAST_ADDR;
+}
+
+/** Get the multicast address of a vnetd.
+ *
+ * @param vnetd vnetd
+ * @return the s_addr value stored in the vnetd's multicast address
+ */
+uint32_t vnetd_mcast_addr(Vnetd *vnetd){
+    struct in_addr group = vnetd->mcast_addr.sin_addr;
+
+    return group.s_addr;
+}
+
+/** Get the multicast port of a vnetd.
+ *
+ * @param vnetd vnetd
+ * @return the sin_port value stored in the vnetd's multicast address
+ */
+uint16_t vnetd_mcast_port(Vnetd *vnetd){
+    uint16_t port = vnetd->mcast_addr.sin_port;
+
+    return port;
+}
+
+/** Record a peer connection.
+ * The connection is added to the vnetd's connection list with
+ * ConnList_add(), whose result becomes the new list.
+ *
+ * @param vnetd vnetd
+ * @param conn connection
+ */
+void connections_add(Vnetd *vnetd, Conn *conn){
+    ConnList *updated = ConnList_add(conn, vnetd->connections);
+
+    vnetd->connections = updated;
+}
+
+/** Delete a connection to a peer.
+ * Unlinks every list node that refers to the connection. The previous-node
+ * pointer only advances past nodes that stay on the list, so consecutive
+ * matching nodes are all unlinked correctly.
+ * The unlinked nodes and the connection itself are not released here.
+ *
+ * @param vnetd vnetd
+ * @param conn connection
+ */
+void connections_del(Vnetd *vnetd, Conn *conn){
+    ConnList *prev = NULL, *curr, *next;
+    for(curr = vnetd->connections; curr; curr = next){
+        next = curr->next;
+        if(curr->conn != conn){
+            // Node stays: it is now the last kept node.
+            prev = curr;
+            continue;
+        }
+        if(prev){
+            prev->next = next;
+        } else {
+            vnetd->connections = next;
+        }
+    }
+}
+
+/** Close every peer connection.
+ * Calls Conn_close() on each connection in the list and then empties the
+ * list; the list nodes themselves are not released here.
+ *
+ * @param vnetd vnetd
+ */
+void connections_close_all(Vnetd *vnetd){
+    ConnList *node = vnetd->connections;
+
+    while(node){
+        Conn_close(node->conn);
+        node = node->next;
+    }
+    vnetd->connections = NULL;
+}
+
+/** Register every peer connection for reading.
+ * Adds the socket of each connection in the list to the read side of the
+ * select set.
+ *
+ * @param vnetd vnetd
+ * @param set select set
+ */
+void connections_select(Vnetd *vnetd, SelectSet *set){
+    ConnList *node = vnetd->connections;
+
+    while(node){
+        SelectSet_add_read(set, node->conn->sock);
+        node = node->next;
+    }
+}
+
+/** Handle peer connections according to a select set.
+ * Calls Conn_handle() on every connection whose socket is marked readable
+ * in the set. A connection whose handler reports an error is unlinked from
+ * the list. The previous-node pointer only advances past nodes that stay
+ * on the list, so two consecutive failing connections are both unlinked.
+ * The unlinked nodes are not released here.
+ *
+ * @param vnetd vnetd
+ * @param set indicates ready connections
+ */
+void connections_handle(Vnetd *vnetd, SelectSet *set){
+    ConnList *prev = NULL, *curr, *next;
+    Conn *conn;
+    for(curr = vnetd->connections; curr; curr = next){
+        // Saved up front: curr may be unlinked below.
+        next = curr->next;
+        conn = curr->conn;
+        if(FD_ISSET(conn->sock, &set->rd) && Conn_handle(conn)){
+            if(prev){
+                prev->next = curr->next;
+            } else {
+                vnetd->connections = curr->next;
+            }
+            continue;
+        }
+        prev = curr;
+    }
+}
+
+/** Forward a message from a peer onto the local subnet.
+ * The forwarded protocol selects the socket (ESP or etherip); the data is
+ * sent to the vnetd's multicast address.
+ *
+ * @param vnetd vnetd
+ * @param vmsg message
+ * @return 0 on success, -EINVAL for an unsupported protocol,
+ *         -errno if the send fails
+ */
+int vnetd_forward_local(Vnetd *vnetd, VnetMsg *vmsg){
+    int err = 0;
+    int sock = 0;
+    // Zeroed so that no uninitialized bytes are handed to sendto().
+    struct sockaddr_in addr_in = {};
+    struct sockaddr *addr = (struct sockaddr *)&addr_in;
+    socklen_t addr_n = sizeof(addr_in);
+
+    dprintf(">\n");
+    switch(vmsg->fwd.protocol){
+    case IPPROTO_ESP:
+        dprintf("> ESP\n");
+        sock = vnetd->esp_sock; break;
+    case IPPROTO_ETHERIP:
+        dprintf("> Etherip\n");
+        sock = vnetd->etherip_sock; break;
+    default:
+        err = -EINVAL;
+        goto exit;
+    }
+    addr_in.sin_family = AF_INET;
+    addr_in.sin_addr = vnetd->mcast_addr.sin_addr;
+    addr_in.sin_port = htons(vmsg->fwd.protocol);
+    dprintf("> send dst=%s protocol=%d len=%d\n",
+            inet_ntoa(addr_in.sin_addr), vmsg->fwd.protocol, vmsg->fwd.len);
+    // sendto() yields a byte count or -1; map that onto the 0 / -errno convention.
+    if(sendto(sock, vmsg->fwd.data, vmsg->fwd.len, 0, addr, addr_n) < 0){
+        err = -errno;
+    }
+  exit:
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Forward a message to a peer.
+ * The message is first marshalled into a local 1600-byte buffer as
+ * VNET_FWD_ID, a zero opcode, the protocol, the data length and the data,
+ * and the bytes written are then copied to the connection's output stream,
+ * which is flushed.
+ *
+ * NOTE(review): the length is marshalled as a 16-bit value and the whole
+ * message has to fit the 1600-byte buffer; confirm that callers never pass
+ * more data than that.
+ *
+ * @param conn peer connection
+ * @param protocol message protocol
+ * @param data message data
+ * @param data_n message size
+ * @return result of the last marshal/write call made (negative on error)
+ */
+int vnetd_forward_peer(Conn *conn, int protocol, void *data, int data_n){
+ int err = 0;
+ IOStream _io, *io = &_io;
+ StringData sdata;
+ char buf[1600];
+
+ dprintf("> addr=%s protocol=%d n=%d\n",
+ inet_ntoa(conn->addr.sin_addr), protocol, data_n);
+ // Marshal into buf through a string stream.
+ string_stream_init(io, &sdata, buf, sizeof(buf));
+ dprintf("> 10\n");
+ // Message id.
+ err = marshal_uint16(io, VNET_FWD_ID);
+ if(err < 0) goto exit;
+ dprintf("> 20\n");
+ // Opcode (always zero for a forward message).
+ err = marshal_uint16(io, 0);
+ if(err < 0) goto exit;
+ dprintf("> 30\n");
+ err = marshal_uint16(io, protocol);
+ if(err < 0) goto exit;
+ dprintf("> 40\n");
+ err = marshal_uint16(io, data_n);
+ if(err < 0) goto exit;
+ dprintf("> 50\n");
+ err = marshal_bytes(io, data, data_n);
+ if(err < 0) goto exit;
+ dprintf("> 60 bytes=%d\n", IOStream_get_written(io));
+ // Send the marshalled bytes to the peer.
+ err = IOStream_write(conn->out, buf, IOStream_get_written(io));
+ IOStream_flush(conn->out);
+ exit:
+ if(err < 0) perror(__FUNCTION__);
+ dprintf("< err=%d\n", err);
+ return err;
+}
+
+/** Forward a message to all peers.
+ * Calls vnetd_forward_peer() for every connection in the list; the result
+ * of the individual sends is not collected.
+ *
+ * @param vnetd vnetd
+ * @param protocol message protocol
+ * @param data message data
+ * @param data_n message size
+ * @return 0
+ */
+int vnetd_forward_peers(Vnetd *vnetd, int protocol, void *data, int data_n){
+    const int err = 0;
+    ConnList *node = vnetd->connections;
+    ConnList *following;
+
+    dprintf(">\n");
+    while(node){
+        // Fetch the successor before the send.
+        following = node->next;
+        vnetd_forward_peer(node->conn, protocol, data, data_n);
+        node = following;
+    }
+    dprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Handler for a peer connection.
+ * Reads a VnetMsg from the connection and handles it: a varp message is
+ * wrapped in an IPMessage and passed to the varp cache as a non-local
+ * message; a forward message is re-sent on the local subnet.
+ * Errors from either handler are discarded; only an unmarshal failure or
+ * an unknown message id is reported to the caller.
+ *
+ * NOTE(review): neither vmsg nor msg is released in this function on any
+ * path - confirm who owns them once the handlers return.
+ *
+ * @param conn peer connection
+ * @return 0 on success, error code otherwise
+ */
+int conn_handle_fn(Conn *conn){
+ int err = 0;
+ VnetMsg *vmsg = ALLOCATE(VnetMsg);
+ IPMessage *msg = NULL;
+
+ dprintf("> addr=%s port=%u\n",
+ inet_ntoa(conn->addr.sin_addr),
+ ntohs(conn->addr.sin_port));
+ err = VnetMsg_unmarshal(conn->in, vmsg);
+ if(err < 0){
+ wprintf("> Unmarshal error %d\n", err);
+ goto exit;
+ }
+ switch(vmsg->hdr.id){
+ case VNET_VARP_ID:
+ dprintf("> Got varp message\n");
+ // Wrap the message with its origin; the destination address is not filled in.
+ msg = ALLOCATE(IPMessage);
+ msg->conn = conn;
+ msg->saddr = conn->addr;
+ msg->data = vmsg;
+ // Second argument 0: the message did not come from the local subnet.
+ err = vcache_handle_message(msg, 0);
+ // The handler's result is deliberately dropped.
+ err = 0;
+ break;
+ case VNET_FWD_ID:
+ dprintf("> Got forward message\n");
+ err = vnetd_forward_local(vnetd, vmsg);
+ // The forwarding result is deliberately dropped.
+ err = 0;
+ break;
+ default:
+ wprintf("> Invalid id=%d\n", vmsg->hdr.id);
+ err = -EINVAL;
+ break;
+ }
+ exit:
+ dprintf("< err=%d\n", err);
+ return err;
+}
+
+/** Accept an incoming tcp connection from a peer vnetd.
+ * A new connection object is created with conn_handle_fn as its handler,
+ * the pending connection is accepted on the listening socket, and the new
+ * connection is initialized and added to the vnetd's connection list.
+ * On any failure the new connection object is closed.
+ *
+ * @param vnetd vnetd
+ * @param conn listening connection to accept on
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_accept(Vnetd *vnetd, Conn *conn){
+    Conn *new_conn = NULL;
+    struct sockaddr_in peer_in;
+    struct sockaddr *peer = (struct sockaddr *)&peer_in;
+    socklen_t peer_n = sizeof(peer_in);
+    int peersock;
+    int err = 0;
+
+    //dprintf(">\n");
+    new_conn = Conn_new(conn_handle_fn, vnetd);
+    //dprintf("> accept...\n");
+    peersock = accept(conn->sock, peer, &peer_n);
+    //dprintf("> accept=%d\n", peersock);
+    if(peersock < 0){
+        perror("accept");
+        err = -errno;
+        goto exit;
+    }
+    // The peer port is in network order: convert it with ntohs for display.
+    iprintf("> Accepted connection from %s:%d\n",
+            inet_ntoa(peer_in.sin_addr), ntohs(peer_in.sin_port));
+    err = Conn_init(new_conn, peersock, SOCK_STREAM, peer_in);
+    if(err) goto exit;
+    connections_add(vnetd, new_conn);
+  exit:
+    if(err){
+        Conn_close(new_conn);
+    }
+    if(err < 0) wprintf("< err=%d\n", err);
+    return err;
+}
+
+/** Connect to a peer vnetd.
+ * Creates a Conn using conn_handle_fn as its handler, connects it
+ * as a stream socket and adds it to the vnetd connection list.
+ * If the connect fails the Conn is closed.
+ *
+ * @param vnetd vnetd
+ * @param addr address
+ * @param port port
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_connect(Vnetd *vnetd, struct in_addr addr, uint16_t port){
+    Conn *peer = Conn_new(conn_handle_fn, vnetd);
+    int status = Conn_connect(peer, SOCK_STREAM, addr, port);
+
+    if(status){
+        Conn_close(peer);
+    } else {
+        connections_add(vnetd, peer);
+    }
+    return status;
+}
+
+/** Handle a message on a udp socket.
+ * Expecting to see VARP messages only: exactly one VarpHdr is read
+ * from the socket. Datagrams of any other size are rejected and
+ * datagrams whose source address equals the vnetd's own address are
+ * silently dropped. Accepted messages are wrapped in an IPMessage
+ * and passed to vcache_handle_message().
+ *
+ * The datagram is received into a local VarpHdr first, so that no
+ * VnetMsg is allocated (and then lost) for datagrams that are
+ * rejected or ignored.
+ *
+ * @param vnetd vnetd
+ * @param conn connection wrapping the udp socket to read from
+ * @return 0 on success (including ignored messages), negative error
+ *         code otherwise
+ */
+int vnetd_handle_udp(Vnetd *vnetd, Conn *conn){
+    int err = 0, rcv = 0;
+    struct sockaddr_in peer_in;
+    socklen_t peer_n = sizeof(peer_in);
+    VarpHdr varph;
+    VnetMsg *vmsg = NULL;
+    IPMessage *msg = NULL;
+
+    rcv = recvfrom(conn->sock, &varph, sizeof(varph), 0,
+                   (struct sockaddr *)&peer_in, &peer_n);
+    if(rcv < 0){
+        // Report the real cause, like the other socket helpers do.
+        err = -errno;
+        goto exit;
+    }
+    dprintf("> Received %d bytes from %s:%d\n",
+            rcv, inet_ntoa(peer_in.sin_addr), htons(peer_in.sin_port));
+    if(rcv != (int)sizeof(varph)){
+        err = -EINVAL;
+        goto exit;
+    }
+    if(peer_in.sin_addr.s_addr == vnetd->addr.sin_addr.s_addr){
+        // Message from self: nothing to do.
+        goto exit;
+    }
+    // Only now is a message object needed.
+    vmsg = ALLOCATE(VnetMsg);
+    vmsg->varp.varph = varph;
+    msg = ALLOCATE(IPMessage);
+    msg->conn = conn;
+    msg->saddr = peer_in;
+    msg->data = vmsg;
+
+    err = vcache_handle_message(msg, 1);
+  exit:
+    return err;
+}
+
+/** Handle a message on a raw socket.
+ * Only deals with etherip and esp.
+ * Reads one packet, skips its IP header and forwards the remaining
+ * payload to all peers. Packets whose source address equals the
+ * vnetd's own address are ignored.
+ *
+ * The IP header length is taken from the low nibble of the first
+ * byte of the packet (in 32-bit words) and is validated against both
+ * the minimum header size and the number of bytes received, so a
+ * malformed packet can never produce a negative payload length or a
+ * payload pointer outside the receive buffer.
+ *
+ * @param vnetd vnetd
+ * @param sock socket
+ * @param protocol protocol
+ * @return 0 on success (including ignored packets), negative error
+ *         code otherwise
+ */
+int vnetd_handle_protocol(Vnetd *vnetd, int sock, int protocol){
+    int err = 0, rcv = 0;
+    struct sockaddr_in peer_in;
+    socklen_t peer_n = sizeof(peer_in);
+    uint8_t buf[VNET_FWD_MAX];
+    int hdr_n = 0;
+
+    rcv = recvfrom(sock, buf, sizeof(buf), 0,
+                   (struct sockaddr *)&peer_in, &peer_n);
+    if(rcv < 0){
+        // Report the real cause, like the other socket helpers do.
+        err = -errno;
+        goto exit;
+    }
+    dprintf("> Received %d bytes from %s protocol=%d\n",
+            rcv, inet_ntoa(peer_in.sin_addr), protocol);
+    if(rcv < (int)sizeof(struct iphdr)){
+        wprintf("> Message too short for IP header\n");
+        err = -EINVAL;
+        goto exit;
+    }
+    if(peer_in.sin_addr.s_addr == vnetd->addr.sin_addr.s_addr){
+        dprintf("> Ignoring message from self.\n");
+        goto exit;
+    }
+    // Header length in bytes; read from the raw byte to avoid casting
+    // the byte buffer to a (possibly misaligned) struct pointer.
+    hdr_n = (buf[0] & 0x0f) << 2;
+    if(hdr_n < (int)sizeof(struct iphdr) || hdr_n > rcv){
+        wprintf("> Invalid IP header length %d\n", hdr_n);
+        err = -EINVAL;
+        goto exit;
+    }
+    vnetd_forward_peers(vnetd, protocol, buf + hdr_n, rcv - hdr_n);
+  exit:
+    return err;
+}
+
+/** Socket select loop.
+ * Each iteration rebuilds the read set from the udp, broadcast,
+ * etherip, esp and listen sockets plus the peer connections, waits in
+ * select, and then services whichever descriptors are readable.
+ * Peer connections are handled before new connections are accepted.
+ * When select is interrupted by a signal, pending timer alarms are
+ * processed and the loop continues. The loop only ends when select
+ * fails for any other reason.
+ *
+ * @param vnetd vnetd
+ * @return the failing select result when the loop ends
+ */
+int vnetd_select(Vnetd *vnetd){
+    SelectSet fds = {};
+    int status = 0;
+
+    for(;;){
+        SelectSet_zero(&fds);
+        SelectSet_add_read(&fds, vnetd->udp_conn->sock);
+        SelectSet_add_read(&fds, vnetd->bcast_conn->sock);
+        SelectSet_add_read(&fds, vnetd->etherip_sock);
+        SelectSet_add_read(&fds, vnetd->esp_sock);
+        SelectSet_add_read(&fds, vnetd->listen_conn->sock);
+        connections_select(vnetd, &fds);
+
+        status = SelectSet_select(&fds, NULL);
+        if(status < 0){
+            if(errno != EINTR){
+                perror("select");
+                break;
+            }
+            // Interrupted by a signal: run timers if an alarm fired.
+            if(timer_alarms){
+                timer_alarms = 0;
+                process_timers();
+            }
+            continue;
+        }
+        if(status == 0){
+            continue;
+        }
+
+        if(FD_ISSET(vnetd->udp_conn->sock, &fds.rd)){
+            vnetd_handle_udp(vnetd, vnetd->udp_conn);
+        }
+        if(FD_ISSET(vnetd->bcast_conn->sock, &fds.rd)){
+            vnetd_handle_udp(vnetd, vnetd->bcast_conn);
+        }
+        if(FD_ISSET(vnetd->etherip_sock, &fds.rd)){
+            vnetd_handle_protocol(vnetd, vnetd->etherip_sock, IPPROTO_ETHERIP);
+        }
+        if(FD_ISSET(vnetd->esp_sock, &fds.rd)){
+            vnetd_handle_protocol(vnetd, vnetd->esp_sock, IPPROTO_ESP);
+        }
+        connections_handle(vnetd, &fds);
+        if(FD_ISSET(vnetd->listen_conn->sock, &fds.rd)){
+            vnetd_accept(vnetd, vnetd->listen_conn);
+        }
+    }
+    return status;
+}
+
+/** Set the SO_REUSEADDR option on a socket.
+ *
+ * @param sock socket
+ * @param reuse option value
+ * @return setsockopt result on success, -errno on failure
+ */
+int setsock_reuse(int sock, int reuse){
+    int rc = setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse));
+
+    if(rc >= 0){
+        return rc;
+    }
+    rc = -errno;
+    perror("setsockopt SO_REUSEADDR");
+    return rc;
+}
+
+/** Set the SO_BROADCAST option on a socket.
+ *
+ * @param sock socket
+ * @param bcast option value
+ * @return setsockopt result on success, -errno on failure
+ */
+int setsock_broadcast(int sock, int bcast){
+    int rc = setsockopt(sock, SOL_SOCKET, SO_BROADCAST, &bcast, sizeof(bcast));
+
+    if(rc >= 0){
+        return rc;
+    }
+    rc = -errno;
+    perror("setsockopt SO_BROADCAST");
+    return rc;
+}
+
+/** Join a socket to a multicast group.
+ * Multicast loopback is switched off on the socket first, then the
+ * socket is added to the group using the global vnetd's own address
+ * as the interface address.
+ *
+ * @param sock socket
+ * @param saddr multicast group address (stored into s_addr as-is)
+ * @return setsockopt result on success, -errno on failure
+ */
+int setsock_multicast(int sock, uint32_t saddr){
+    struct ip_mreqn membership = {};
+    int loopback = 0;
+    int status;
+
+    // See 'man 7 ip' for these options.
+    membership.imr_ifindex = 0;                        // Interface index (0 means any).
+    membership.imr_address = vnetd->addr.sin_addr;     // Interface IP address.
+    membership.imr_multiaddr.s_addr = saddr;           // IP multicast address.
+
+    status = setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loopback, sizeof(loopback));
+    if(status < 0){
+        status = -errno;
+        perror("setsockopt IP_MULTICAST_LOOP");
+        return status;
+    }
+    status = setsockopt(sock, SOL_IP, IP_ADD_MEMBERSHIP, &membership, sizeof(membership));
+    if(status < 0){
+        status = -errno;
+        perror("setsockopt IP_ADD_MEMBERSHIP");
+    }
+    return status;
+}
+
+/** Set a socket's multicast ttl (default is 1).
+ *
+ * @param sock socket
+ * @param ttl time-to-live for outgoing multicast packets
+ * @return setsockopt result on success, -errno on failure
+ */
+int setsock_multicast_ttl(int sock, uint8_t ttl){
+    int rc = setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));
+
+    if(rc >= 0){
+        return rc;
+    }
+    rc = -errno;
+    perror("setsockopt IP_MULTICAST_TTL");
+    return rc;
+}
+
+
+/** Render socket flags as a short string for log output.
+ * One character per flag, in the order connect, bind, reuse,
+ * broadcast, multicast: the flag's letter ('c', 'b', 'r', 'B', 'M')
+ * if set, '-' if not.
+ *
+ * The result lives in a static buffer that is overwritten by the
+ * next call.
+ *
+ * @param flags VSOCK_ flags
+ * @return pointer to the static, null-terminated string
+ */
+char * socket_flags(int flags){
+    const struct { int bit; char mark; } table[] = {
+        { VSOCK_CONNECT,   'c' },
+        { VSOCK_BIND,      'b' },
+        { VSOCK_REUSE,     'r' },
+        { VSOCK_BROADCAST, 'B' },
+        { VSOCK_MULTICAST, 'M' },
+    };
+    static char s[6];
+    int i;
+
+    for(i = 0; i < 5; i++){
+        s[i] = ((flags & table[i].bit) ? table[i].mark : '-');
+    }
+    s[i] = '\0';
+    return s;
+}
+
+/** Create a socket.
+ * The flags can include VSOCK_REUSE, VSOCK_BROADCAST, VSOCK_MULTICAST,
+ * VSOCK_CONNECT and VSOCK_BIND; the corresponding options are applied
+ * in that order. The address and port are stored into the socket
+ * address without byte-order conversion.
+ *
+ * On failure no descriptor is left open: the socket is closed here
+ * while it is still a bare descriptor, and via Conn_close() once it
+ * has been handed to a Conn (mirroring what vnetd_accept() does when
+ * Conn_init() fails).
+ *
+ * @param socktype socket type
+ * @param saddr address
+ * @param port port
+ * @param flags flags
+ * @param val return value for the socket connection (NULL on error)
+ * @return 0 on success, negative error code otherwise
+ */
+int create_socket(int socktype, uint32_t saddr, uint32_t port, int flags, Conn **val){
+    int err = 0;
+    int sock = -1;
+    struct sockaddr_in addr_in = {};
+    struct sockaddr *addr = (struct sockaddr *)&addr_in;
+    socklen_t addr_n = sizeof(addr_in);
+    Conn *conn = NULL;
+    int reuse = (flags & VSOCK_REUSE);
+    int bcast = (flags & VSOCK_BROADCAST);
+
+    addr_in.sin_family = AF_INET;
+    addr_in.sin_addr.s_addr = saddr;
+    addr_in.sin_port = port;
+    dprintf("> flags=%s addr=%s port=%d\n", socket_flags(flags),
+            inet_ntoa(addr_in.sin_addr), ntohs(addr_in.sin_port));
+
+    sock = socket(AF_INET, socktype, 0);
+    if(sock < 0){
+        err = -errno;
+        goto exit;
+    }
+    if(reuse){
+        err = setsock_reuse(sock, reuse);
+        if(err < 0) goto exit;
+    }
+    if(bcast){
+        err = setsock_broadcast(sock, bcast);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_MULTICAST){
+        err = setsock_multicast(sock, saddr);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_CONNECT){
+        err = connect(sock, addr, addr_n);
+        if(err < 0){
+            err = -errno;
+            perror("connect");
+            goto exit;
+        }
+    }
+    if(flags & VSOCK_BIND){
+        err = bind(sock, addr, addr_n);
+        if(err < 0){
+            err = -errno;
+            perror("bind");
+            goto exit;
+        }
+    }
+    conn = Conn_new(NULL, NULL);
+    err = Conn_init(conn, sock, socktype, addr_in);
+    // From here on the descriptor is the Conn's business, not ours.
+    sock = -1;
+    if(err){
+        Conn_close(conn);
+        conn = NULL;
+        // Callers test for err < 0, so always report a negative code.
+        if(err > 0) err = -err;
+        goto exit;
+    }
+    {
+        struct sockaddr_in self = {};
+        // getsockname() needs the buffer size on input.
+        socklen_t self_n = sizeof(self);
+        getsockname(conn->sock, (struct sockaddr *)&self, &self_n);
+        dprintf("> sockname sock=%d addr=%s port=%d\n",
+                conn->sock, inet_ntoa(self.sin_addr), ntohs(self.sin_port));
+    }
+  exit:
+    if(sock >= 0){
+        // Failed before a Conn took the descriptor: do not leak it.
+        close(sock);
+    }
+    *val = (err ? NULL : conn);
+    return err;
+}
+
+/** Create the tcp listen socket.
+ * The socket is created with address reuse enabled, bound to
+ * INADDR_ANY on the vnetd peer port and put into listening state
+ * with a backlog of 5. If anything fails after the connection object
+ * exists, it is closed and *val is reset to NULL.
+ *
+ * @param vnetd program arguments
+ * @param val return value for the socket
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_listen_conn(Vnetd *vnetd, Conn **val){
+    int status;
+
+    status = create_socket(SOCK_STREAM, INADDR_ANY, vnetd->peer_port,
+                           VSOCK_BIND | VSOCK_REUSE, val);
+    if(!status){
+        status = listen((*val)->sock, 5);
+        if(status < 0){
+            status = -errno;
+            perror("listen");
+        }
+    }
+    if(status && *val){
+        Conn_close(*val);
+        *val = NULL;
+    }
+    return status;
+}
+
+/** Create the udp socket.
+ * The socket is created with address reuse enabled and bound to
+ * INADDR_ANY on the vnetd port (narrowed to 16 bits).
+ *
+ * @param vnetd program arguments
+ * @param val return value for the socket
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_udp_conn(Vnetd *vnetd, Conn **val){
+    const uint16_t udp_port = vnetd->port;
+
+    return create_socket(SOCK_DGRAM, INADDR_ANY, udp_port,
+                         VSOCK_BIND | VSOCK_REUSE, val);
+}
+
+/** Create the broadcast socket.
+ * The socket is a datagram socket created with the reuse, multicast
+ * and broadcast flags for the vnetd multicast address and port. When
+ * that address is an IP multicast address the multicast ttl is set
+ * to 1. VSOCK_BIND is not requested, so no bind() is performed here.
+ *
+ * @param vnetd program arguments
+ * @param val return value for the socket
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_broadcast_conn(Vnetd *vnetd, Conn **val){
+    int err = 0;
+    uint32_t addr = vnetd_mcast_addr(vnetd);
+    uint16_t port = vnetd_mcast_port(vnetd);
+    int flags = VSOCK_REUSE | VSOCK_MULTICAST | VSOCK_BROADCAST;
+
+    err = create_socket(SOCK_DGRAM, addr, port, flags, val);
+    if(err < 0){
+        return err;
+    }
+    if(IN_MULTICAST(ntohl(addr))){
+        err = setsock_multicast_ttl((*val)->sock, 1);
+    }
+    return err;
+}
+
+/** Type for signal handling functions.
+ * Handlers of this type are installed by catch_signal() through the
+ * sa_sigaction field with SA_SIGINFO set, so they receive the signal
+ * number, a siginfo_t and an opaque context pointer.
+ */
+typedef void SignalAction(int code, siginfo_t *info, void *data);
+
+/** Handle SIGCHLD by getting child exit status.
+ * This prevents child processes being defunct.
+ *
+ * Several child exits can be reported by a single SIGCHLD, so all
+ * children that have already terminated are reaped, without blocking.
+ * errno is saved and restored so that code interrupted by the signal
+ * does not see it change.
+ *
+ * NOTE(review): dprintf is called from signal context here; confirm
+ * that it is acceptable for the debug build.
+ *
+ * @param code signal code
+ * @param info signal info
+ * @param data
+ */
+static void sigaction_SIGCHLD(int code, siginfo_t *info, void *data){
+    int saved_errno = errno;
+    int status;
+    pid_t pid;
+
+    while((pid = waitpid(-1, &status, WNOHANG)) > 0){
+        dprintf("> child pid=%d status=%d\n", (int)pid, status);
+    }
+    errno = saved_errno;
+}
+
+/** Handle SIGPIPE.
+ * Only emits a debug trace; none of the arguments are used.
+ *
+ * @param code signal code
+ * @param info signal info
+ * @param data
+ */
+static void sigaction_SIGPIPE(int code, siginfo_t *info, void *data){
+    (void)code;
+    (void)info;
+    (void)data;
+    dprintf("> SIGPIPE\n");
+}
+
+/** Handle SIGALRM.
+ * Counts the alarm in timer_alarms; the select loop checks that
+ * counter and runs the timers. None of the arguments are used.
+ *
+ * @param code signal code
+ * @param info signal info
+ * @param data
+ */
+static void sigaction_SIGALRM(int code, siginfo_t *info, void *data){
+    (void)code;
+    (void)info;
+    (void)data;
+    timer_alarms += 1;
+}
+
+/** Install a handler for a signal.
+ * The handler is registered as an SA_SIGINFO-style action; a failure
+ * is reported with perror.
+ *
+ * @param signum signal
+ * @param action handler
+ * @return 0 on success, error code otherwise
+ */
+static int catch_signal(int signum, SignalAction *action){
+    struct sigaction handler = {};
+    int rc;
+
+    handler.sa_flags = SA_SIGINFO;
+    handler.sa_sigaction = action;
+    rc = sigaction(signum, &handler, NULL);
+    if(rc){
+        perror("sigaction");
+    }
+    return rc;
+}
+
+/** Create a raw socket.
+ * Opens an AF_INET raw socket for the protocol and, depending on the
+ * flags, enables broadcast on it and joins it to the multicast group.
+ * If any step fails the socket is closed again and *sock is set
+ * to -1, so the caller never receives a half-configured descriptor.
+ *
+ * @param protocol protocol
+ * @param flags flags (VSOCK_BROADCAST and VSOCK_MULTICAST are honoured)
+ * @param mcaddr multicast group address, used with VSOCK_MULTICAST
+ * @param sock return value for the socket (-1 on error)
+ * @return 0 on success, negative error code otherwise
+ */
+int vnetd_raw_socket(int protocol, int flags, uint32_t mcaddr, int *sock){
+    int err = 0;
+    int bcast = (flags & VSOCK_BROADCAST);
+    int fd;
+
+    fd = socket(AF_INET, SOCK_RAW, protocol);
+    if(fd < 0){
+        err = -errno;
+        perror("socket");
+        goto exit;
+    }
+    if(bcast){
+        err = setsock_broadcast(fd, bcast);
+        if(err < 0) goto exit;
+    }
+    if(flags & VSOCK_MULTICAST){
+        err = setsock_multicast(fd, mcaddr);
+        if(err < 0) goto exit;
+    }
+  exit:
+    if(err < 0 && fd >= 0){
+        close(fd);
+        fd = -1;
+    }
+    *sock = fd;
+    return err;
+}
+
+/** Connect to peer vnetds.
+ * Walks the vnetd peer list, treating each element as an integer
+ * address, and tries to connect to it on the peer port. Failed
+ * connection attempts are not reported.
+ *
+ * @param vnetd vnetd
+ * @return always 0
+ */
+int vnetd_peers(Vnetd *vnetd){
+    struct in_addr peer_addr = {};
+    Sxpr rest = vnetd->peers;
+
+    while(CONSP(rest)){
+        Sxpr item = CAR(rest);
+        peer_addr.s_addr = OBJ_INT(item);
+        vnetd_connect(vnetd, peer_addr, vnetd->peer_port);
+        rest = CDR(rest);
+    }
+    return 0;
+}
+
+/** Vnet daemon main program.
+ * Determines the daemon's own address, logs the configuration,
+ * initialises the vcache, connects to the configured peers, installs
+ * the signal handlers, opens the listen, udp, broadcast and raw
+ * (etherip and esp) sockets and then runs the select loop. Everything
+ * that was opened is closed again before returning.
+ *
+ * The socket fields are reset to "not open" before anything is
+ * created, so that a failure part-way through the setup only closes
+ * what really exists (in particular, descriptor 0 is never closed by
+ * accident because a raw socket field was still zero).
+ *
+ * NOTE(review): the results of get_self_addr(), vcache_init() and
+ * vnetd_peers() are overwritten and never acted on; confirm that this
+ * best-effort behaviour is intended.
+ *
+ * @param vnetd program arguments
+ * @return 0 on success, error code otherwise
+ */
+int vnetd_main(Vnetd *vnetd){
+    int err = 0;
+
+    vnetd->listen_conn = NULL;
+    vnetd->udp_conn = NULL;
+    vnetd->bcast_conn = NULL;
+    vnetd->etherip_sock = -1;
+    vnetd->esp_sock = -1;
+
+    err = get_self_addr(&vnetd->addr);
+    vnetd->addr.sin_port = vnetd->port;
+    iprintf("> VNETD\n");
+    iprintf("> addr=%s port=%u\n",
+            inet_ntoa(vnetd->addr.sin_addr), htons(vnetd->port));
+    iprintf("> mcaddr=%s port=%u\n",
+            inet_ntoa(vnetd->mcast_addr.sin_addr), htons(vnetd->port));
+    iprintf("> peers port=%u ", htons(vnetd->peer_port));
+    objprint(iostdout, vnetd->peers, 0); printf("\n");
+
+    err = vcache_init();
+    err = vnetd_peers(vnetd);
+
+    catch_signal(SIGCHLD,sigaction_SIGCHLD);
+    catch_signal(SIGPIPE,sigaction_SIGPIPE);
+    catch_signal(SIGALRM,sigaction_SIGALRM);
+    err = vnetd_listen_conn(vnetd, &vnetd->listen_conn);
+    if(err < 0) goto exit;
+    err = vnetd_udp_conn(vnetd, &vnetd->udp_conn);
+    if(err < 0) goto exit;
+    err = vnetd_broadcast_conn(vnetd, &vnetd->bcast_conn);
+    if(err < 0) goto exit;
+    {
+        int flags = VSOCK_BROADCAST | VSOCK_MULTICAST;
+        uint32_t mcaddr = vnetd->mcast_addr.sin_addr.s_addr;
+
+        err = vnetd_raw_socket(IPPROTO_ETHERIP, flags, mcaddr, &vnetd->etherip_sock);
+        if(err < 0) goto exit;
+        err = vnetd_raw_socket(IPPROTO_ESP, flags, mcaddr, &vnetd->esp_sock);
+        if(err < 0) goto exit;
+    }
+    err = vnetd_select(vnetd);
+  exit:
+    // Only tear down what was actually set up.
+    if(vnetd->listen_conn) Conn_close(vnetd->listen_conn);
+    if(vnetd->udp_conn) Conn_close(vnetd->udp_conn);
+    if(vnetd->bcast_conn) Conn_close(vnetd->bcast_conn);
+    connections_close_all(vnetd);
+    if(vnetd->etherip_sock >= 0) close(vnetd->etherip_sock);
+    if(vnetd->esp_sock >= 0) close(vnetd->esp_sock);
+    return err;
+}
+
+/** Parse command-line arguments and call the vnetd main program.
+ * Defaults are installed first; each option then overrides one
+ * setting of the global vnetd. Parsing stops at the first option
+ * whose argument cannot be converted, and that option is reported.
+ *
+ * @param argc argument count
+ * @param argv arguments
+ * @return 0 on success, 1 otherwise
+ */
+extern int main(int argc, char *argv[]){
+    int err = 0;
+    int key = 0;
+    int long_index = 0;
+    unsigned long host = 0;
+
+    vnetd_set_defaults(vnetd);
+    while((key = getopt_long(argc, argv, short_opts, long_opts, &long_index)) != -1){
+        switch(key){
+        case OPT_ADDR:
+            // Multicast address to use.
+            err = get_host_address(optarg, &host);
+            if(err) goto exit;
+            vnetd->mcast_addr.sin_addr.s_addr = host;
+            break;
+        case OPT_PORT:
+            err = convert_service_to_port(optarg, &vnetd->port);
+            if(err) goto exit;
+            break;
+        case OPT_PEER:
+            // Each peer is remembered as an integer address.
+            err = get_host_address(optarg, &host);
+            if(err) goto exit;
+            cons_push(&vnetd->peers, mkint(host));
+            break;
+        case OPT_HELP:
+            usage(0);
+            break;
+        case OPT_VERBOSE:
+            vnetd->verbose = TRUE;
+            break;
+        case OPT_VERSION:
+            iprintf("> %s %s\n", PROGRAM, VERSION);
+            exit(0);
+            break;
+        default:
+            usage(EINVAL);
+            break;
+        }
+    }
+    err = vnetd_main(vnetd);
+  exit:
+    if(err && key > 0){
+        eprintf("> Error in arg %c\n", key);
+    }
+    return (err ? 1 : 0);
+}
--- /dev/null
+/*
+ * Copyright (C) 2004 Mike Wray <mike.wray@hp.com>.
+ *
+ * This library is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of the
+ * License, or (at your option) any later version. This library is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+#ifndef _VNET_VNETD_H_
+#define _VNET_VNETD_H_
+
+#include <asm/types.h>
+#include <linux/if_ether.h>
+#include "if_varp.h"
+
+#include "connection.h"
+#include "sxpr.h"
+
+/** Vnetd udp port in host order. */
+#define VNETD_PORT VARP_PORT
+
+/** Vnetd peer port in host order. */
+#define VNETD_PEER_PORT (VARP_PORT + 1)
+
+/** VARP message: consists of a VarpHdr only.
+ * vnetd_handle_udp() receives udp datagrams directly into varph.
+ */
+typedef struct VnetMsgVarp {
+ VarpHdr varph;
+} VnetMsgVarp;
+
+/** Maximum number of data bytes in a forwarded message.
+ * Also used as the size of the raw-socket receive buffer.
+ */
+#define VNET_FWD_MAX (1500 + 200)
+
+/** Forward message: a packet relayed between vnetds.
+ * Carries the protocol of the packet, the data length and the data.
+ * The struct is packed.
+ *
+ * NOTE(review): the first member is written as a bare type name with
+ * no member name. Standard C treats such a declaration as declaring
+ * nothing unless a compiler extension for anonymous members is in
+ * use - confirm that the header really is part of this layout.
+ */
+typedef struct VnetMsgFwd {
+ VnetMsgHdr;
+ uint16_t protocol;
+ uint16_t len;
+ uint8_t data[VNET_FWD_MAX];
+} __attribute__((packed)) VnetMsgFwd;
+
+/** Any vnetd message.
+ * The hdr view is used to read the message id; the id then selects
+ * which of the other views applies (see the message id enum).
+ */
+typedef union VnetMsg {
+ VnetMsgHdr hdr;
+ VnetMsgVarp varp;
+ VnetMsgFwd fwd;
+} VnetMsg;
+
+/** Message ids found in the message header.
+ * conn_handle_fn() dispatches on these; any other id is rejected.
+ */
+enum {
+ VNET_VARP_ID = VARP_ID,
+ VNET_FWD_ID = 200,
+};
+
+/** State of the vnet daemon. */
+typedef struct Vnetd {
+ /* Udp port. Copied unchanged into addr.sin_port by vnetd_main(),
+  * so presumably kept in network byte order - TODO confirm. */
+ unsigned long port;
+ /* Tcp port used to listen for, and connect to, peer vnetds. */
+ unsigned long peer_port;
+ /* Set to TRUE by the verbose command-line option. */
+ int verbose;
+
+ /* Raw sockets created by vnetd_main() for ESP and etherip. */
+ int esp_sock;
+ int etherip_sock;
+
+ /* The daemon's own address; messages from it are ignored. */
+ struct sockaddr_in addr;
+ /* Multicast address, settable from the command line. */
+ struct sockaddr_in mcast_addr;
+
+ /* List of peer addresses, stored as integers. */
+ Sxpr peers;
+
+ /* Tcp listen, udp and broadcast connections. */
+ Conn *listen_conn;
+ Conn *udp_conn;
+ Conn *bcast_conn;
+
+ /* Connections to peer vnetds (accepted or initiated). */
+ ConnList *connections;
+
+} Vnetd;
+
+extern Vnetd *vnetd;
+
+#endif /* ! _VNET_VNETD_H_ */